1"""HTML 2.0 parser.
2
3See the HTML 2.0 specification:
4http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5"""
6
7from pysollib.formatter import AS_IS
8
9from six.moves import html_parser
10
11
class HTMLParseError(RuntimeError):
    """Exception signalling that an HTML document could not be parsed.

    HTMLParser.error() raises it with the parser's message as its only
    argument.
    """
14
15
class HTMLParser(html_parser.HTMLParser):
    """This is the basic HTML parser class.

    Start tags are dispatched to a ``start_<tag>`` method or, failing
    that, to a ``do_<tag>`` method; end tags go to ``end_<tag>``.  Tags
    without a handler end up in unknown_starttag() / unknown_endtag().
    Handlers are defined for all HTML 2.0 and many HTML 3.0 and 3.2
    elements; they render through the formatter passed to the
    constructor.  Entity and character references are not handled here
    but left to the underlying parser class.

    """

    # from six.moves.html_entities import entitydefs

    def __init__(self, formatter):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser; every tag handler renders through it.

        """
        html_parser.HTMLParser.__init__(self)
        self.formatter = formatter

    def error(self, message):
        """Raises HTMLParseError carrying the given message."""
        raise HTMLParseError(message)

    def reset(self):
        """Resets the underlying parser and all per-document state."""
        html_parser.HTMLParser.reset(self)
        self.savedata = None    # text buffer while in 'savedata' mode
        self.isindex = 0        # set to 1 once <ISINDEX> has been seen
        self.title = None       # contents of <TITLE>
        self.base = None        # HREF of <BASE>
        self.anchor = None      # HREF of the anchor currently open
        self.anchorlist = []    # every non-empty HREF seen so far
        self.nofill = 0         # nesting depth of literal (<PRE>) regions
        self.list_stack = []    # [kind, label, counter] per open list

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Buffers data in 'savedata' mode, else sends it to the formatter.

        Outside 'savedata' mode the data is added as literal data while
        the nofill counter is non-zero and as flowing data otherwise.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
            return
        if self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    def handle_starttag(self, tag, attrs):
        """Calls start_<tag>(attrs) or do_<tag>(attrs) when defined.

        Tags without such a method are passed to unknown_starttag().

        """
        handler = getattr(self, 'start_' + tag, None)
        if handler is None:
            handler = getattr(self, 'do_' + tag, None)
        if handler is None:
            self.unknown_starttag(tag, attrs)
            return
        handler(attrs)

    def handle_endtag(self, tag):
        """Calls end_<tag>() when defined, else unknown_endtag(tag)."""
        handler = getattr(self, 'end_' + tag, None)
        if handler is None:
            self.unknown_endtag(tag)
            return
        handler()

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Enforce the documented contract explicitly; without this
            # check the failure mode depended on the nofill flag.
            raise TypeError('save_end() called without save_bgn()')
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.  The name and type arguments are not used
        by the default implementation.

        """
        self.anchor = href
        if href:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.  Nothing happens for anchors without an HREF.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  do_img() supplies ismap, align, width and
        height as the extra positional arguments.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass

    def end_html(self): pass

    def start_head(self, attrs): pass

    def end_head(self): pass

    def start_body(self, attrs): pass

    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Starts buffering the document title."""
        self.save_bgn()

    def end_title(self):
        """Stores the buffered text as the document title.

        A stray </TITLE> with no matching <TITLE> is ignored so that
        malformed documents do not abort parsing.

        """
        if self.savedata is not None:
            self.title = self.save_end()

    def do_base(self, attrs):
        """Records the HREF attribute of <BASE> in the base attribute."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flags that the document contains <ISINDEX>."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs):  # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, name):
        """Ends the paragraph and pushes the font for heading *name*."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((name, 0, 1, 0))

    def _heading_end(self):
        """Ends the heading paragraph and restores the previous font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs):
        self._heading_bgn('h1')

    def end_h1(self):
        self._heading_end()

    def start_h2(self, attrs):
        self._heading_bgn('h2')

    def end_h2(self):
        self._heading_end()

    def start_h3(self, attrs):
        self._heading_bgn('h3')

    def end_h3(self):
        self._heading_end()

    def start_h4(self, attrs):
        self._heading_bgn('h4')

    def end_h4(self):
        self._heading_end()

    def start_h5(self, attrs):
        self._heading_bgn('h5')

    def end_h5(self):
        self._heading_end()

    def start_h6(self, attrs):
        self._heading_bgn('h6')

    def end_h6(self):
        self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Opens a literal region: pushes a font and bumps nofill."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Closes a literal region; nofill never drops below zero."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def _set_literal_mode(self, tag):
        """Asks the tokenizer to treat the content of *tag* as raw text.

        setliteral() is the sgmllib spelling and is used when some class
        in the hierarchy provides it.  The html.parser base class has no
        such method, so its set_cdata_mode() is used instead; calling
        setliteral() unconditionally raised AttributeError there.

        """
        hook = getattr(self, 'setliteral', None)
        if hook is None:
            hook = getattr(self, 'set_cdata_mode', None)
        if hook is not None:
            hook(tag)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self._set_literal_mode('xmp')  # Tell the tokenizer

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self._set_literal_mode('listing')  # Tell the tokenizer

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Opens an unordered list; its items are labelled '*'."""
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Closes the innermost list and pops its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emits the label of the next item of the innermost list.

        Outside of any list the label is '*' with counter 0.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            entry = self.list_stack[-1]
            entry[2] = entry[2] + 1
            label, counter = entry[1], entry[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Opens an ordered list.

        The label format defaults to '1.'; a TYPE attribute replaces it,
        a single-character value getting a '.' appended.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            # A valueless attribute (<OL TYPE>) is reported as None;
            # keep the current label instead of failing on len(None).
            if a == 'type' and v is not None:
                if len(v) == 1:
                    v += '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Closes the innermost list and pops its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """Ends the paragraph and closes a pending <DD> region, if any.

        bl is passed on to the formatter's end_paragraph() method.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)

    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)

    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)

    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)

    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)

    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)

    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)

    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))

    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))

    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collects HREF, NAME and TYPE and calls anchor_bgn().

        Values are stripped of surrounding whitespace and TYPE is
        lower-cased; attributes that are missing or have no value are
        passed as empty strings.

        """
        href = ''
        name = ''
        type_ = ''
        for attrname, value in attrs:
            # Valueless attributes (<A NAME>) are reported as None.
            value = (value or '').strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type_ = value.lower()
        self.anchor_bgn(href, name, type_)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def _int_attr(self, value, fallback):
        """Returns int(value), or fallback when value is not a number."""
        try:
            return int(value)
        except (TypeError, ValueError):
            # TypeError covers valueless attributes, reported as None.
            return fallback

    def do_img(self, attrs):
        """Collects the image attributes and calls handle_image().

        Defaults: alt '(image)', align / ismap / src '', width and
        height 0.  A width or height that is not an integer is ignored.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                # handle_image() feeds alt to handle_data(), which needs
                # a string even for a valueless ALT attribute.
                alt = '' if value is None else value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                width = self._int_attr(value, width)
            elif attrname == 'height':
                height = self._int_attr(value, height)
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """Switches to literal rendering for the rest of the document.

        setnomoretags() is an sgmllib method; it is called only when
        available, because calling it unconditionally raised
        AttributeError on the html.parser base class.

        """
        self.start_pre(attrs)
        hook = getattr(self, 'setnomoretags', None)
        if hook is not None:
            hook()  # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
482
483
def test(args=None):
    """Command-line driver: parses an HTML file and renders it as text.

    args is an optional list of arguments; when it is empty or None,
    sys.argv[1:] is used.  A leading '-s' selects silent mode, in which
    the document is parsed with a NullFormatter.  The next argument is
    the file to read ('test.html' by default); '-' reads standard input.
    When the file cannot be read, the error is printed and the process
    exits with status 1.

    """
    import sys
    import pysollib.formatter

    # Work on a private copy: stripping '-s' below must not modify the
    # list object owned by the caller.
    args = list(args) if args else sys.argv[1:]

    silent = bool(args) and args[0] == '-s'
    if silent:
        del args[0]

    fn = args[0] if args else 'test.html'

    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)

    if silent:
        f = pysollib.formatter.NullFormatter()
    else:
        f = pysollib.formatter.AbstractFormatter(
            pysollib.formatter.DumbWriter()
        )

    p = HTMLParser(f)
    p.feed(data)
    p.close()
520
521
# Run the command-line driver only when executed as a script.
if __name__ == "__main__":
    test()
524