1#
2# htmlStripper.py
3#
4#  Sample code for stripping HTML markup tags and scripts from
5#  HTML source files.
6#
7# Copyright (c) 2006, 2016, Paul McGuire
8#
9from contextlib import closing
10import urllib.request, urllib.parse, urllib.error
11from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity,
12    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)
13
# Grammar for a whole <script> element: the opening tag, its body, and the
# closing tag, so the element can be removed as a single unit.
scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
# Matched entities are rewritten by replaceHTMLEntity rather than kept verbatim.
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with closing(urllib.request.urlopen(targetURL)) as targetPage:
    # Honor the charset declared in the response headers; fall back to UTF-8
    # when the server does not declare one.
    targetHTML = targetPage.read().decode(
        targetPage.headers.get_content_charset() or "UTF-8"
    )

# first pass, strip out tags and translate entities
# Only the markup alternatives are suppressed. Suppressing the entire
# alternation would also throw away the replacement text produced by the
# entity parse action, so entities would be deleted instead of translated.
firstPass = (
    (htmlComment | scriptBody | anyOpenTag | anyCloseTag).suppress()
    | commonHTMLEntity
).transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
# Two or more consecutive line ends are replaced by exactly one blank line.
repeatedNewlines = LineEnd()*(2,)
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)

print(secondPass)
33