1"""Bake a Beancount input file's web files to a directory hierarchy. 2 3You provide a Beancount filename, an output directory, and this script 4runs a server and a scraper that puts all the files in the directory, 5and if your output name has an archive suffix, we automatically the 6fetched directory contents to the archive and delete them. 7""" 8__copyright__ = "Copyright (C) 2014-2016 Martin Blais" 9__license__ = "GNU GPLv2" 10 11from os import path 12import functools 13import importlib 14import logging 15import os 16import re 17import shlex 18import shutil 19import subprocess 20import zipfile 21 22import lxml.html 23 24from beancount.utils import scrape 25from beancount.web import web 26from beancount.utils import file_utils 27from beancount.parser import version 28 29 30# Directories where binary files are allowed. 31BINARY_DIRECTORIES = ['resources', 'third_party', 'doc'] 32BINARY_MATCH = re.compile(r'/({}/|favicon.ico$)'.format( 33 '|'.join(BINARY_DIRECTORIES))).match 34 35 36def normalize_filename(url): 37 """Convert URL paths to filenames. Add .html extension if needed. 38 39 Args: 40 url: A string, the url to convert. 41 Returns: 42 A string, possibly with an extension appended. 43 """ 44 if url.endswith('/'): 45 return path.join(url, 'index.html') 46 elif BINARY_MATCH(url): 47 return url 48 else: 49 return url if url.endswith('.html') else (url + '.html') 50 51 52def relativize_links(html, current_url): 53 """Make all the links in the contents string relative to an URL. 54 55 Args: 56 html: An lxml document node. 57 current_url: A string, the URL of the current page, a path to. 58 a file or a directory. If the path represents a directory, the 59 path ends with a /. 60 """ 61 current_dir = path.dirname(current_url) 62 for element, attribute, link, pos in lxml.html.iterlinks(html): 63 if path.isabs(link): 64 relative_link = path.relpath(normalize_filename(link), current_dir) 65 element.set(attribute, relative_link) 66 67 68def remove_links(html, targets): 69 """Convert a list of anchors (<a>) from an HTML tree to spans (<span>). 70 71 Args: 72 html: An lxml document node. 73 targets: A set of string, targets to be removed. 74 """ 75 for element, attribute, link, pos in lxml.html.iterlinks(html): 76 if link in targets: 77 del element.attrib[attribute] 78 element.tag = 'span' 79 element.set('class', 'removed-link') 80 81 82def save_scraped_document(output_dir, url, response, contents, html_root, skipped_urls): 83 """Callback function to process a document being scraped. 84 85 This converts the document to have relative links and writes out the file to 86 the output directory. 87 88 Args: 89 output_dir: A string, the output directory to write. 90 url: A string, the originally requested URL. 91 response: An http response as per urlopen. 92 contents: Bytes, the content of a response. 93 html_root: An lxml root node for the document, optionally. If this is provided, 94 this avoid you having to reprocess it (for performance reasons). 95 skipped_urls: A set of the links from the file that were skipped. 96 """ 97 if response.status != 200: 98 logging.error("Invalid status: %s", response.status) 99 100 # Ignore directories. 101 if url.endswith('/'): 102 return 103 104 # Note that we're saving the file under the non-redirected URL, because this 105 # will have to be opened using files and there are no redirects that way. 106 107 if response.info().get_content_type() == 'text/html': 108 if html_root is None: 109 html_root = lxml.html.document_fromstring(contents) 110 remove_links(html_root, skipped_urls) 111 relativize_links(html_root, url) 112 contents = lxml.html.tostring(html_root, method="html") 113 114 # Compute output filename and write out the relativized contents. 115 output_filename = path.join(output_dir, 116 normalize_filename(url).lstrip('/')) 117 os.makedirs(path.dirname(output_filename), exist_ok=True) 118 with open(output_filename, 'wb') as outfile: 119 outfile.write(contents) 120 121 122def bake_to_directory(webargs, output_dir, render_all_pages=True): 123 """Serve and bake a Beancount's web to a directory. 124 125 Args: 126 webargs: An argparse parsed options object with the web app arguments. 127 output_dir: A directory name. We don't check here whether it exists or not. 128 quiet: A boolean, True to suppress web server fetch log. 129 render_all_pages: If true, fetch the full set of pages, not just the subset that 130 is palatable. 131 Returns: 132 True on success, False otherwise. 133 """ 134 callback = functools.partial(save_scraped_document, output_dir) 135 136 if render_all_pages: 137 ignore_regexps = None 138 else: 139 regexps = [ 140 # Skip the context pages, too slow. 141 r'/context/', 142 # Skip the link pages, too slow. 143 r'/link/', 144 # Skip the component pages... too many. 145 r'/view/component/', 146 # Skip served documents. 147 r'/.*/doc/', 148 # Skip monthly pages. 149 r'/view/year/\d\d\d\d/month/', 150 ] 151 ignore_regexps = '({})'.format('|'.join(regexps)) 152 153 processed_urls, skipped_urls = web.scrape_webapp(webargs, callback, ignore_regexps) 154 155 156def archive(command_template, directory, archive, quiet=False): 157 """Archive the directory to the given tar/gz archive filename. 158 159 Args: 160 command_template: A string, the command template to format with in order 161 to compute the command to run. 162 directory: A string, the name of the directory to archive. 163 archive: A string, the name of the file to output. 164 quiet: A boolean, True to suppress output. 165 Raises: 166 IOError: if the directory does not exist or if the archive name already 167 exists. 168 169 """ 170 directory = path.abspath(directory) 171 archive = path.abspath(archive) 172 if not path.exists(directory): 173 raise IOError("Directory to archive '{}' does not exist".format( 174 directory)) 175 if path.exists(archive): 176 raise IOError("Output archive name '{}' already exists".format( 177 archive)) 178 179 command = command_template.format(directory=directory, 180 dirname=path.dirname(directory), 181 basename=path.basename(directory), 182 archive=archive) 183 184 pipe = subprocess.Popen(shlex.split(command), 185 shell=False, 186 cwd=path.dirname(directory), 187 stdout=subprocess.PIPE if quiet else None, 188 stderr=subprocess.PIPE if quiet else None) 189 _, _ = pipe.communicate() 190 if pipe.returncode != 0: 191 raise OSError("Archive failure") 192 193 194def archive_zip(directory, archive): 195 """Archive the directory to the given tar/gz archive filename. 196 197 Args: 198 directory: A string, the name of the directory to archive. 199 archive: A string, the name of the file to output. 200 """ 201 # Figure out optimal level of compression among the supported ones in this 202 # installation. 203 for spec, compression in [ 204 ('lzma', zipfile.ZIP_LZMA), 205 ('bz2', zipfile.ZIP_BZIP2), 206 ('zlib', zipfile.ZIP_DEFLATED)]: 207 if importlib.util.find_spec(spec): 208 zip_compression = compression 209 break 210 else: 211 # Default is no compression. 212 zip_compression = zipfile.ZIP_STORED 213 214 with file_utils.chdir(directory), zipfile.ZipFile( 215 archive, 'w', compression=zip_compression) as archfile: 216 for root, dirs, files in os.walk(directory): 217 for filename in files: 218 relpath = path.relpath(path.join(root, filename), directory) 219 archfile.write(relpath) 220 221 222ARCHIVERS = { 223 '.tar.gz' : 'tar -C {dirname} -zcvf {archive} {basename}', 224 '.tgz' : 'tar -C {dirname} -zcvf {archive} {basename}', 225 '.tar.bz2' : 'tar -C {dirname} -jcvf {archive} {basename}', 226 '.zip' : archive_zip, 227 } 228 229 230def main(): 231 parser = version.ArgumentParser(description=__doc__) 232 233 web_group = web.add_web_arguments(parser) 234 web_group.set_defaults(port=9475) 235 236 group = parser.add_argument_group("Bake process arguments") 237 238 group.add_argument('output', 239 help=('The output directory or archive name. If you ' 240 'specify a filename with a well-known extension,' 241 'we automatically archive the fetched directory ' 242 'contents to this archive name and delete them.')) 243 244 # In order to be able to bake in a reasonable amount of time, we need to 245 # remove some pages; you can use this switch to do that. 246 group.add_argument('--render-all-pages', '--full', action='store_true', 247 help=("Don't ignore some of the more numerious pages, " 248 "like monthly reports.")) 249 250 opts = parser.parse_args() 251 252 # Figure out the archival method. 253 output_directory, extension = file_utils.path_greedy_split(opts.output) 254 if extension: 255 try: 256 archival_command = ARCHIVERS[extension] 257 except KeyError as exc: 258 raise SystemExit("ERROR: Unknown archiver type '{}'".format(extension)) from exc 259 else: 260 archival_command = None 261 262 # Check pre-conditions on input/output filenames. 263 if not path.exists(opts.filename): 264 raise SystemExit("ERROR: Missing input file '{}'".format(opts.filename)) 265 if path.exists(opts.output): 266 raise SystemExit("ERROR: Output path already exists '{}'".format(opts.output)) 267 if path.exists(output_directory): 268 raise SystemExit( 269 "ERROR: Output directory already exists '{}'".format(output_directory)) 270 271 # Bake to a directory hierarchy of files with local links. 272 bake_to_directory(opts, output_directory, opts.render_all_pages) 273 274 # Verify the bake output files. This is just a sanity checking step. 275 # You can also use "bean-doctor validate_html <file> to run this manually. 276 logging.info('Validating HTML output files & links.') 277 files, missing, empty = scrape.validate_local_links_in_dir(output_directory) 278 logging.info('Validation: %d files processed', len(files)) 279 for target in missing: 280 logging.error("Validation error: Missing '%s'", target) 281 for target in empty: 282 logging.error("Validation error: Empty '%s'", target) 283 284 # Archive if requested. 285 if archival_command is not None: 286 # Normalize the paths and ensure sanity before we start compression. 287 output_directory = path.abspath(output_directory) 288 archive_filename = path.abspath(opts.output) 289 if not path.exists(output_directory): 290 raise IOError("Directory to archive '{}' does not exist".format( 291 output_directory)) 292 if path.exists(archive_filename): 293 raise IOError("Output archive name '{}' already exists".format( 294 archive_filename)) 295 296 # Dispatch to a particular compressor. 297 if isinstance(archival_command, str): 298 archive(archival_command, output_directory, archive_filename, True) 299 elif callable(archival_command): 300 archival_command(output_directory, archive_filename) 301 302 # Delete the output directory. 303 shutil.rmtree(output_directory) 304 305 print("Output in '{}'".format(opts.output)) 306 307 308if __name__ == '__main__': 309 main() 310