1#
2# htmlStripper.py
3#
4#  Sample code for stripping HTML markup tags and scripts from
5#  HTML source files.
6#
7# Copyright (c) 2006, 2016, Paul McGuire
8#
9from urllib.request import urlopen
10from pyparsing import (
11    makeHTMLTags,
12    commonHTMLEntity,
13    replaceHTMLEntity,
14    htmlComment,
15    anyOpenTag,
16    anyCloseTag,
17    LineEnd,
18    replaceWith,
19)
20
# Match a complete <script>...</script> element (opening tag, body, closing
# tag) so that embedded script source is removed along with its tags.
scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose

# Replace each recognized HTML entity with the text returned by
# replaceHTMLEntity.
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with urlopen(targetURL) as targetPage:
    targetHTML = targetPage.read().decode("UTF-8")

# first pass, strip out tags and translate entities
# Only the markup alternatives are suppressed. commonHTMLEntity has to stay
# outside the suppress() call: suppressing it would discard the replacement
# text produced by its parse action, so entities would be deleted from the
# output instead of being translated.
strippedMarkup = (htmlComment | scriptBody | anyOpenTag | anyCloseTag).suppress()
firstPass = (strippedMarkup | commonHTMLEntity).transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
# LineEnd() * (2,) matches two or more consecutive line ends; each such run is
# rewritten as exactly one blank line.
repeatedNewlines = LineEnd() * (2,)
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)

print(secondPass)
43