#
# htmlStripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, Paul McGuire
#
# The script downloads one page, removes comments, <script> blocks and
# tags, translates common HTML entities to their literal characters, then
# collapses the runs of blank lines left behind and prints the result.
#
from contextlib import closing
import urllib.request
import urllib.parse
import urllib.error
from pyparsing import (makeHTMLTags, commonHTMLEntity, replaceHTMLEntity,
    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)

# A whole <script>...</script> element, body included, so that embedded
# JavaScript is removed along with its tags.
scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose

# Entities such as "&amp;" are replaced by the character they stand for.
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with closing(urllib.request.urlopen(targetURL)) as targetPage:
    # Honor the charset declared in the Content-Type header; fall back to
    # UTF-8 when the server does not declare one.
    pageCharset = targetPage.headers.get_content_charset() or "UTF-8"
    targetHTML = targetPage.read().decode(pageCharset)

# first pass, strip out tags and translate entities
#
# Only the markup is suppressed. The entity expression must stay outside
# the Suppress wrapper: Suppress discards the tokens of whatever it wraps,
# which would delete each entity instead of emitting its translation.
strippedMarkup = (htmlComment | scriptBody | anyOpenTag | anyCloseTag).suppress()
firstPass = (strippedMarkup | commonHTMLEntity).transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
# (two or more consecutive line ends become exactly one blank line)
repeatedNewlines = LineEnd()*(2,)
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)

print(secondPass)