1"""Bake a Beancount input file's web files to a directory hierarchy.
2
3You provide a Beancount filename, an output directory, and this script
4runs a server and a scraper that puts all the files in the directory,
5and if your output name has an archive suffix, we automatically the
6fetched directory contents to the archive and delete them.
7"""
8__copyright__ = "Copyright (C) 2014-2016  Martin Blais"
9__license__ = "GNU GPLv2"

from os import path
import functools
import importlib.util
import logging
import os
import re
import shlex
import shutil
import subprocess
import zipfile

import lxml.html

from beancount.utils import scrape
from beancount.web import web
from beancount.utils import file_utils
from beancount.parser import version


# Directories where binary files are allowed.
BINARY_DIRECTORIES = ['resources', 'third_party', 'doc']
BINARY_MATCH = re.compile(r'/({}/|favicon.ico$)'.format(
    '|'.join(BINARY_DIRECTORIES))).match
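# For instance, '/resources/logo.png' matches (it is fetched verbatim), while
# a report path like '/balsheet' does not.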


def normalize_filename(url):
    """Convert URL paths to filenames. Add .html extension if needed.

    Args:
      url: A string, the url to convert.
    Returns:
      A string, possibly with an extension appended.
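
    For example (illustrative, using URL paths as served by the web app):

      >>> normalize_filename('/reports/')
      '/reports/index.html'
      >>> normalize_filename('/balsheet')
      '/balsheet.html'
      >>> normalize_filename('/resources/logo.png')
      '/resources/logo.png'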
43    """
44    if url.endswith('/'):
45        return path.join(url, 'index.html')
46    elif BINARY_MATCH(url):
47        return url
48    else:
49        return url if url.endswith('.html') else (url + '.html')
50
51
def relativize_links(html, current_url):
    """Make all the links in the document relative to a URL.

    Args:
      html: An lxml document node.
      current_url: A string, the URL of the current page; a path to a
        file or a directory. If the path represents a directory, the
        path ends with a /.
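
    A minimal sketch (POSIX paths assumed):

      >>> doc = lxml.html.fromstring('<p><a href="/prices">Prices</a></p>')
      >>> relativize_links(doc, '/view/year/2014/index.html')
      >>> lxml.html.tostring(doc)
      b'<p><a href="../../../prices.html">Prices</a></p>'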
60    """
61    current_dir = path.dirname(current_url)
62    for element, attribute, link, pos in lxml.html.iterlinks(html):
63        if path.isabs(link):
64            relative_link = path.relpath(normalize_filename(link), current_dir)
65            element.set(attribute, relative_link)
66


def remove_links(html, targets):
    """Convert a list of anchors (<a>) in an HTML tree to spans (<span>).

    Args:
      html: An lxml document node.
      targets: A set of strings, the link targets to be removed.
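
    A minimal sketch:

      >>> doc = lxml.html.fromstring('<p><a href="/context/xyz">Context</a></p>')
      >>> remove_links(doc, {'/context/xyz'})
      >>> lxml.html.tostring(doc)
      b'<p><span class="removed-link">Context</span></p>'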
74    """
75    for element, attribute, link, pos in lxml.html.iterlinks(html):
76        if link in targets:
77            del element.attrib[attribute]
78            element.tag = 'span'
79            element.set('class', 'removed-link')
80
81
def save_scraped_document(output_dir, url, response, contents, html_root, skipped_urls):
    """Callback function to process a document being scraped.

    This converts the document to have relative links and writes out the file
    to the output directory.

    Args:
      output_dir: A string, the output directory to write to.
      url: A string, the originally requested URL.
      response: An http response as per urlopen.
      contents: Bytes, the content of the response.
      html_root: An lxml root node for the document, or None. If provided,
        this avoids having to reparse the contents (for performance reasons).
      skipped_urls: A set of the links from the file that were skipped.
    """
    if response.status != 200:
        logging.error("Invalid status: %s", response.status)

    # Ignore directories.
    if url.endswith('/'):
        return

    # Note that we're saving the file under the non-redirected URL, because the
    # baked output will be opened directly from files, where no redirection can
    # occur.

    if response.info().get_content_type() == 'text/html':
        if html_root is None:
            html_root = lxml.html.document_fromstring(contents)
        remove_links(html_root, skipped_urls)
        relativize_links(html_root, url)
        contents = lxml.html.tostring(html_root, method="html")

    # Compute the output filename and write out the relativized contents.
    output_filename = path.join(output_dir,
                                normalize_filename(url).lstrip('/'))
    os.makedirs(path.dirname(output_filename), exist_ok=True)
    with open(output_filename, 'wb') as outfile:
        outfile.write(contents)


def bake_to_directory(webargs, output_dir, render_all_pages=True):
    """Serve and bake a Beancount file's web pages to a directory.

    Args:
      webargs: An argparse parsed options object with the web app arguments.
      output_dir: A directory name. We don't check here whether it exists or not.
      render_all_pages: If true, fetch the full set of pages, not just the
        subset that is fast enough to render; see the regexps skipped below.
    """
    callback = functools.partial(save_scraped_document, output_dir)

    if render_all_pages:
        ignore_regexps = None
    else:
        regexps = [
            # Skip the context pages; too slow.
            r'/context/',
            # Skip the link pages; too slow.
            r'/link/',
            # Skip the component pages... too many.
            r'/view/component/',
            # Skip served documents.
            r'/.*/doc/',
            # Skip monthly pages.
            r'/view/year/\d\d\d\d/month/',
        ]
        ignore_regexps = '({})'.format('|'.join(regexps))

    web.scrape_webapp(webargs, callback, ignore_regexps)


def archive(command_template, directory, archive, quiet=False):
    """Archive the directory to the given archive filename by running a command.

    Args:
      command_template: A string, the command template to format with in order
        to compute the command to run.
      directory: A string, the name of the directory to archive.
      archive: A string, the name of the file to output.
      quiet: A boolean, True to suppress output.
    Raises:
      IOError: If the directory does not exist or if the archive name already
        exists.
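
    For example, with the '.tgz' command template from ARCHIVERS below,
    directory '/tmp/bake' and archive '/tmp/bake.tgz', the command run is:

      tar -C /tmp -zcvf /tmp/bake.tgz bake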
    """
    directory = path.abspath(directory)
    archive = path.abspath(archive)
    if not path.exists(directory):
        raise IOError("Directory to archive '{}' does not exist".format(
            directory))
    if path.exists(archive):
        raise IOError("Output archive name '{}' already exists".format(
            archive))

    command = command_template.format(directory=directory,
                                      dirname=path.dirname(directory),
                                      basename=path.basename(directory),
                                      archive=archive)

    pipe = subprocess.Popen(shlex.split(command),
                            shell=False,
                            cwd=path.dirname(directory),
                            stdout=subprocess.PIPE if quiet else None,
                            stderr=subprocess.PIPE if quiet else None)
    _, _ = pipe.communicate()
    if pipe.returncode != 0:
        raise OSError("Archive failure")


def archive_zip(directory, archive):
    """Archive the directory to the given zip archive filename.

    Args:
      directory: A string, the name of the directory to archive.
      archive: A string, the name of the file to output.
    """
    # Figure out the optimal level of compression among the ones supported by
    # this installation.
    for spec, compression in [
            ('lzma', zipfile.ZIP_LZMA),
            ('bz2', zipfile.ZIP_BZIP2),
            ('zlib', zipfile.ZIP_DEFLATED)]:
        if importlib.util.find_spec(spec):
            zip_compression = compression
            break
    else:
        # Default to no compression.
        zip_compression = zipfile.ZIP_STORED

    with file_utils.chdir(directory), zipfile.ZipFile(
            archive, 'w', compression=zip_compression) as archfile:
        for root, dirs, files in os.walk(directory):
            for filename in files:
                relpath = path.relpath(path.join(root, filename), directory)
                archfile.write(relpath)


ARCHIVERS = {
    '.tar.gz'  : 'tar -C {dirname} -zcvf {archive} {basename}',
    '.tgz'     : 'tar -C {dirname} -zcvf {archive} {basename}',
    '.tar.bz2' : 'tar -C {dirname} -jcvf {archive} {basename}',
    '.zip'     : archive_zip,
    }


def main():
    parser = version.ArgumentParser(description=__doc__)

    web_group = web.add_web_arguments(parser)
    web_group.set_defaults(port=9475)

    group = parser.add_argument_group("Bake process arguments")

    group.add_argument('output',
                       help=('The output directory or archive name. If you '
                             'specify a filename with a well-known extension, '
                             'we automatically archive the fetched directory '
                             'contents to this archive name and delete them.'))

    # In order to be able to bake in a reasonable amount of time, we need to
    # remove some pages; you can use this switch to do that.
    group.add_argument('--render-all-pages', '--full', action='store_true',
                       help=("Don't ignore some of the more numerous pages, "
                             "like monthly reports."))

    opts = parser.parse_args()

    # Figure out the archival method.
    output_directory, extension = file_utils.path_greedy_split(opts.output)
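    # For example (assuming path_greedy_split splits off the longest dotted
    # extension): '/tmp/bake.tar.gz' -> ('/tmp/bake', '.tar.gz').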
    if extension:
        try:
            archival_command = ARCHIVERS[extension]
        except KeyError as exc:
            raise SystemExit("ERROR: Unknown archiver type '{}'".format(extension)) from exc
    else:
        archival_command = None

    # Check pre-conditions on the input/output filenames.
    if not path.exists(opts.filename):
        raise SystemExit("ERROR: Missing input file '{}'".format(opts.filename))
    if path.exists(opts.output):
        raise SystemExit("ERROR: Output path already exists '{}'".format(opts.output))
    if path.exists(output_directory):
        raise SystemExit(
            "ERROR: Output directory already exists '{}'".format(output_directory))

    # Bake to a directory hierarchy of files with local links.
    bake_to_directory(opts, output_directory, opts.render_all_pages)

    # Verify the baked output files. This is just a sanity check; you can also
    # run "bean-doctor validate_html <file>" to do this manually.
    logging.info('Validating HTML output files & links.')
    files, missing, empty = scrape.validate_local_links_in_dir(output_directory)
    logging.info('Validation: %d files processed', len(files))
    for target in missing:
        logging.error("Validation error: Missing '%s'", target)
    for target in empty:
        logging.error("Validation error: Empty '%s'", target)

    # Archive if requested.
    if archival_command is not None:
        # Normalize the paths and ensure sanity before we start compression.
        output_directory = path.abspath(output_directory)
        archive_filename = path.abspath(opts.output)
        if not path.exists(output_directory):
            raise IOError("Directory to archive '{}' does not exist".format(
                output_directory))
        if path.exists(archive_filename):
            raise IOError("Output archive name '{}' already exists".format(
                archive_filename))

        # Dispatch to a particular compressor.
        if isinstance(archival_command, str):
            archive(archival_command, output_directory, archive_filename, True)
        elif callable(archival_command):
            archival_command(output_directory, archive_filename)

        # Delete the output directory.
        shutil.rmtree(output_directory)

    print("Output in '{}'".format(opts.output))


if __name__ == '__main__':
    main()