# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

import re
import multiprocessing
from six.moves.urllib.parse import urlsplit


# Dictionary with per-host locks.
_locks = {}
# Allowed number of connections per host
MaxConnections = 2
# Maximum number of strips to get to test a comic
MaxStrips = 5
# Match (already-escaped) archive.org URL
ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')
# Match printf-style format specifiers; their '%' may be escaped, because
# re.escape() escaped it under Python 2
PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')
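# For illustration (hypothetical URL): re.escape() turns a stripUrl such as
# 'https://example.com/comic/%s' into 'https://example\.com/comic/%s' on
# Python 3 (Python 2 escaped more characters, including the '%'), and
# PRINTF_MATCH.sub('.+', ...) then yields 'https://example\.com/comic/.+',
# which matches any concrete strip index.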


def get_lock(host):
    """Get bounded semaphore for given host."""
    if host not in _locks:
        _locks[host] = multiprocessing.BoundedSemaphore(MaxConnections)
    return _locks[host]


def test_comicmodule(tmpdir, scraperobj, worker_id):
    '''Test a scraper. It must be able to traverse backward for at least 5
    strips from the start, and find strip images on at least 4 pages.'''
    # Limit the number of simultaneous connections to any one host.
    host = urlsplit(scraperobj.url).hostname
    with get_lock(host):
        _test_comic(str(tmpdir), scraperobj)


def _test_comic(outdir, scraperobj):
    num_strips = 0
    strip = None
    files = []
    for strip in scraperobj.getStrips(MaxStrips):
        files.append(_check_strip(outdir, strip,
                                  scraperobj.multipleImagesPerStrip))

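        # The first strip comes from the scraper's start page, whose URL
        # need not match the stripUrl pattern, so only check later strips.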
        if num_strips > 0:
            _check_stripurl(strip, scraperobj)
        num_strips += 1

    if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:
        # subtract the number of skipped URLs with no image from the expected
        # image number
        num_strips_expected = MaxStrips - len(scraperobj.skippedUrls)
        msg = 'Traversed %d strips instead of %d.' % (num_strips,
                                                      num_strips_expected)
        if strip:
            msg += " Check the prevSearch pattern at %s" % strip.strip_url
        assert num_strips == num_strips_expected, msg
        if strip:
            _check_scraperesult(files, num_strips_expected, strip, scraperobj)


def _check_strip(outdir, strip, multipleImagesPerStrip):
    '''Check that a specific page yields images and that the comic module
    correctly declares if there are multiple images per page.'''
    images = []
    files = []
    for image in strip.getImages():
        images.append(image.url)

        # write a fake image (to download less)
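        # (presumably image.save() below finds the existing file stem and
        # skips the actual network download)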
        fakeimg = image._fnbase(outdir) + '.fake'
        with open(fakeimg, 'w') as f:
            f.write("fake image for testing")

        fn, _ = image.save(outdir)
        files.append(fn)
    assert images, 'failed to find images at %s' % strip.strip_url
    if not multipleImagesPerStrip:
        assert len(images) == 1, 'found more than 1 image at {}: {}'.format(
            strip.strip_url, images)
    return files


def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):
    '''Check that exactly num_images_expected images were saved, or at least
    that many for modules with multiple images per strip. This checks saved
    files, i.e. it detects duplicate filenames.'''
    num_images = len(saved_images)

    attrs = (num_images, saved_images, num_images_expected)
    if scraperobj.multipleImagesPerStrip:
        err = 'saved %d %s instead of at least %d images' % attrs
        assert num_images >= num_images_expected, err
    else:
        err = 'saved %d %s instead of %d images' % attrs
        assert num_images == num_images_expected, err


def _check_stripurl(strip, scraperobj):
    if not scraperobj.stripUrl:
        # no indexing support
        return
    # test that a regex built from stripUrl matches the retrieved strip URL
    urlmatch = re.escape(scraperobj.stripUrl)
    urlmatch = PRINTF_MATCH.sub('.+', urlmatch)
    urlmatch = ARCHIVE_ORG_MATCH.sub(r'/\\d+/', urlmatch)
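    # Illustrative example (hypothetical URL): a stripUrl of
    # 'https://web.archive.org/web/20190101000000/http://example.com/%s'
    # escapes to '...web\.archive\.org/web/20190101000000/...'; the
    # PRINTF_MATCH substitution turns the trailing '%s' into '.+', and the
    # ARCHIVE_ORG_MATCH substitution relaxes the snapshot timestamp to '/\d+/'.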
    ro = re.compile(urlmatch)
    mo = ro.match(strip.strip_url)
    err = 'strip URL {!r} does not match stripUrl pattern {}'.format(
        strip.strip_url, urlmatch)
    assert mo is not None, err
115