# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

import re
import multiprocessing
from six.moves.urllib.parse import urlsplit


# Dictionary with per-host locks.
_locks = {}
# Allowed number of connections per host
MaxConnections = 2
# Maximum number of strips to get to test a comic
MaxStrips = 5
# Match (already-escaped) archive.org URL
ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')
# Matches some (maybe-escaped - because Python 2) printf-style format specifiers
PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')


def get_lock(host):
    """Get the bounded semaphore for the given host."""
    if host not in _locks:
        _locks[host] = multiprocessing.BoundedSemaphore(MaxConnections)
    return _locks[host]
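

# Usage sketch (illustration only, not executed by the tests): wrapping
# network access in the per-host semaphore means that at most MaxConnections
# tests talk to the same host concurrently. The URL and fetch_strip() below
# are made-up placeholders.
#
#     >>> host = urlsplit('https://www.example.com/comic/123').hostname
#     >>> with get_lock(host):
#     ...     fetch_strip(host)  # at most MaxConnections run here per host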


def test_comicmodule(tmpdir, scraperobj, worker_id):
    '''Test a scraper. It must be able to traverse backward for at least 5
    strips from the start, and find strip images on at least 4 pages.'''
    # Limit the number of connections per host.
    host = urlsplit(scraperobj.url).hostname
    with get_lock(host):
        _test_comic(str(tmpdir), scraperobj)


def _test_comic(outdir, scraperobj):
    '''Traverse up to MaxStrips strips backwards and check each one.'''
    num_strips = 0
    strip = None
    files = []
    for strip in scraperobj.getStrips(MaxStrips):
        files.append(_check_strip(outdir, strip,
                                  scraperobj.multipleImagesPerStrip))

        if num_strips > 0:
            _check_stripurl(strip, scraperobj)
        num_strips += 1

    if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:
        # subtract the number of skipped URLs with no image from the
        # expected image number
        num_strips_expected = MaxStrips - len(scraperobj.skippedUrls)
        msg = 'Traversed %d strips instead of %d.' % (num_strips,
                                                      num_strips_expected)
        if strip:
            msg += " Check the prevSearch pattern at %s" % strip.strip_url
        assert num_strips == num_strips_expected, msg
        if strip:
            _check_scraperesult(files, num_strips_expected, strip, scraperobj)


def _check_strip(outdir, strip, multipleImagesPerStrip):
    '''Check that a specific page yields images and that the comic module
    correctly declares whether there are multiple images per page.'''
    images = []
    files = []
    for image in strip.getImages():
        images.append(image.url)

        # write a fake image (to download less)
        fakeimg = image._fnbase(outdir) + '.fake'
        with open(fakeimg, 'w') as f:
            f.write("fake image for testing")

        fn, _ = image.save(outdir)
        files.append(fn)
    assert images, 'failed to find images at %s' % strip.strip_url
    if not multipleImagesPerStrip:
        assert len(images) == 1, 'found more than 1 image at {}: {}'.format(
            strip.strip_url, images)
    return files


def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):
    '''Check that exactly num_images_expected images were saved, or at least
    that many for modules with multiple images per strip. This checks saved
    files, i.e. it detects duplicate filenames.'''
    num_images = len(saved_images)

    attrs = (num_images, saved_images, num_images_expected)
    if scraperobj.multipleImagesPerStrip:
        err = 'saved %d %s instead of at least %d images' % attrs
        assert num_images >= num_images_expected, err
    else:
        err = 'saved %d %s instead of %d images' % attrs
        assert num_images == num_images_expected, err


def _check_stripurl(strip, scraperobj):
    if not scraperobj.stripUrl:
        # no indexing support
        return
    # test that the stripUrl regex matches the retrieved strip URL
    urlmatch = re.escape(scraperobj.stripUrl)
    urlmatch = PRINTF_MATCH.sub('.+', urlmatch)
    urlmatch = ARCHIVE_ORG_MATCH.sub(r'/\\d+/', urlmatch)
    ro = re.compile(urlmatch)
    mo = ro.match(strip.strip_url)
    err = 'strip URL {!r} does not match stripUrl pattern {}'.format(
        strip.strip_url, urlmatch)
    assert mo is not None, err
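

# Worked example of the _check_stripurl() transformation above, using a
# made-up stripUrl template. Output is shown for Python 3.7+, where
# re.escape() leaves '%' alone; older interpreters escape it as well, which
# is why PRINTF_MATCH tolerates an optional leading backslash.
#
#     >>> tmpl = 'https://example.com/comics/%s'
#     >>> re.escape(tmpl)
#     'https://example\\.com/comics/%s'
#     >>> PRINTF_MATCH.sub('.+', re.escape(tmpl))
#     'https://example\\.com/comics/.+'
#
# The resulting pattern then matches any concrete strip URL built from the
# template, e.g. 'https://example.com/comics/2020-01-01'.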