#
# htmlStripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, Paul McGuire
#
from urllib.request import urlopen
from pyparsing import (
    makeHTMLTags,
    commonHTMLEntity,
    replaceHTMLEntity,
    htmlComment,
    anyOpenTag,
    anyCloseTag,
    LineEnd,
    replaceWith,
)

# a complete <script>...</script> element, including everything between the tags
scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose

# entities such as &amp; are translated to their plain-text character
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with urlopen(targetURL) as targetPage:
    # honor the charset declared in the response headers; fall back to UTF-8
    # when the server does not declare one
    pageCharset = targetPage.headers.get_content_charset() or "UTF-8"
    targetHTML = targetPage.read().decode(pageCharset)

# first pass, strip out tags and translate entities
#
# Only the markup alternatives are suppressed. commonHTMLEntity must stay
# outside the suppress() call: a suppressed match contributes no tokens, so
# transformString would delete the entity instead of emitting the replacement
# produced by its parse action.
strippedMarkup = (htmlComment | scriptBody | anyOpenTag | anyCloseTag).suppress()
firstPass = (strippedMarkup | commonHTMLEntity).transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
# (LineEnd() * (2,) matches two or more consecutive line ends)
repeatedNewlines = LineEnd() * (2,)
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)

print(secondPass)