# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

import os
import re
from six.moves.urllib.parse import urljoin

from lxml import html, etree
from lxml.html.defs import link_attrs as html_link_attrs

try:
    import cssselect
except ImportError:
    cssselect = None

try:
    import pycountry
except ImportError:
    pycountry = None

from . import configuration, http, languages, loader
from .util import (get_page, makeSequence, get_system_uid, unescape, tagre,
    normaliseURL, prettyMatcherList, uniq)
from .comic import ComicStrip
from .output import out
from .events import getHandler


ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')


class Scraper(object):
    '''Base class for all comic scrapers, but without a specific scrape
    implementation.'''

    # The URL for the comic strip
    url = None

    # A string that is interpolated with the strip index to yield the URL for a
    # particular strip.
    stripUrl = None

    # Stop searching for previous URLs at this URL
    firstStripUrl = None

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set to True if this comic contains adult content
    adult = False

    # set to True if this comic will not get updated anymore
    endOfLife = False

    # language of the comic (two-letter ISO 639-1 code)
    lang = 'en'

    # an expression that will locate the URL for the previous strip in a page
    # this can also be a list or tuple
    prevSearch = None

    # an expression that will locate the strip image URLs in a page
    # this can also be a list or tuple
    imageSearch = None

    # an expression to store a text together with the image
    # sometimes comic strips have additional text info for each comic
    textSearch = None

    # Is the additional text required or optional?  When it is required (the
    # default), you see an error message whenever a comic page is encountered
    # that does not have the text
    textOptional = False

    # usually the index format help
    help = ''

    # A list of HTTP error codes which should be handled as successful
    # requests.  This is a workaround for some comics which return regular
    # pages with strange HTTP codes. By default, all HTTP errors raise
    # exceptions.
    allow_errors = ()

    # HTTP session for configuration & cookies
    session = http.default_session
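
    # Illustrative sketch (hypothetical module, not shipped with this file):
    # a typical subclass only needs to fill in a few of these attributes,
    # for example:
    #
    #   class HypotheticalComic(_ParserScraper):
    #       url = 'https://comic.example.com/'
    #       stripUrl = url + 'strip/%s'
    #       firstStripUrl = stripUrl % '1'
    #       imageSearch = '//div[@id="comic"]//img'
    #       prevSearch = '//a[@rel="prev"]'
    #       help = 'Index format: number'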

    @classmethod
    def getmodules(cls):
        name = cls.__name__
        if hasattr(cls, 'name'):
            name = cls.name
        return [cls(name)]

    @property
    def indexes(self):
        return self._indexes

    @indexes.setter
    def indexes(self, val):
        if val:
            self._indexes = tuple(sorted(val))

    def __init__(self, name):
        """Initialize internal variables."""
        self.name = name
        self.urls = set()
        self._indexes = tuple()
        self.skippedUrls = set()
        self.hitFirstStripUrl = False

    def __hash__(self):
        """Get hash value from name and index list."""
        return hash((self.name, self.indexes))

    def shouldSkipUrl(self, url, data):
        """Determine if the search for images in the given URL should be
        skipped."""
        return False

    def getComicStrip(self, url, data):
        """Get comic strip downloader for given URL and data."""
        imageUrls = self.fetchUrls(url, data, self.imageSearch)
        # map modifier function on image URLs
        imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
        # remove duplicate URLs
        imageUrls = uniq(imageUrls)
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            out.warn(
                u"Found %d images instead of 1 at %s with expressions %s" %
                (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
            image = imageUrls[0]
            out.warn(u"Choosing image %s" % image)
            imageUrls = (image,)
        elif not imageUrls:
            out.warn(u"Found no images at %s with expressions %s" % (url,
                     prettyMatcherList(self.imageSearch)))
        if self.textSearch:
            text = self.fetchText(url, data, self.textSearch,
                                  optional=self.textOptional)
        else:
            text = None
        return ComicStrip(self, url, imageUrls, text=text)

    def getStrips(self, maxstrips=None):
        """Get comic strips."""
        if maxstrips:
            word = u"strip" if maxstrips == 1 else u"strips"
            msg = u'Retrieving %d %s' % (maxstrips, word)
        else:
            msg = u'Retrieving all strips'
        if self.indexes:
            if len(self.indexes) == 1:
                msg += u" for index %s" % self.indexes[0]
            else:
                msg += u" for indexes %s" % self.indexes
            # Always call starter() since it might initialize cookies.
            # See for example the Oglaf comic.
            self.starter()
            urls = [self.getIndexStripUrl(index) for index in self.indexes]
        else:
            urls = [self.starter()]
        if self.adult:
            msg += u" (including adult content)"
        out.info(msg)
        for url in urls:
            for strip in self.getStripsFor(url, maxstrips):
                yield strip

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for a URL. If maxstrips is a positive number,
        stop after retrieving the given number of strips."""
        self.hitFirstStripUrl = False
        seen_urls = set()
        while url:
            out.info(u'Get strip URL %s' % url, level=1)
            data = self.getPage(url)
            if self.shouldSkipUrl(url, data):
                out.info(u'Skipping URL %s' % url)
                self.skippedUrls.add(url)
            else:
                try:
                    yield self.getComicStrip(url, data)
                except ValueError as msg:
                    # image not found
                    out.exception(msg)
            if self.isfirststrip(url):
                out.debug(u"Stop at first URL %s" % url)
                self.hitFirstStripUrl = True
                break
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break
            prevUrl = self.getPrevUrl(url, data)
            seen_urls.add(url)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn(u"Already seen previous URL %r" % prevUrl)
                break
            url = prevUrl

    def isfirststrip(self, url):
        """Check if the specified URL is the first strip of a comic. This is
        especially relevant for comics retrieved from archive.org, since the
        base URL of archive.org changes whenever pages are taken from a
        different snapshot."""
        if not self.firstStripUrl:
            return False
        firsturl = ARCHIVE_ORG_URL.sub('', self.firstStripUrl)
        currenturl = ARCHIVE_ORG_URL.sub('', url)
        return firsturl == currenturl
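
    # Example for isfirststrip (hypothetical snapshot URLs): both of these
    # compare equal once the archive.org snapshot prefix is stripped:
    #   https://web.archive.org/web/20190101000000/http://example.com/1
    #   https://web.archive.org/web/20200101000000/http://example.com/1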

    def getPrevUrl(self, url, data):
        """Find previous URL."""
        prevUrl = None
        if self.prevSearch:
            try:
                prevUrl = self.fetchUrl(url, data, self.prevSearch)
            except ValueError as msg:
                # assume there is no previous URL, but print a warning
                out.warn(u"%s Assuming no previous comic strips exist." % msg)
            else:
                prevUrl = self.link_modifier(url, prevUrl)
                out.debug(u"Found previous URL %s" % prevUrl)
                getHandler().comicPageLink(self, url, prevUrl)
        return prevUrl

    def getIndexStripUrl(self, index):
        """Get comic strip URL from index."""
        return self.stripUrl % index
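
    # Example for getIndexStripUrl (hypothetical stripUrl): with
    # stripUrl = 'https://comic.example.com/strip/%s',
    # getIndexStripUrl('2020-01-01') yields
    # 'https://comic.example.com/strip/2020-01-01'.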

    def starter(self):
        """Get starter URL from where to scrape comic strips."""
        return self.url

    def namer(self, image_url, page_url):
        """Return filename for given image and page URL."""
        return None

    def link_modifier(self, fromurl, tourl):
        """Optional modification of parsed link (previous/back/latest) URLs.
        Useful if there are domain redirects. The default implementation does
        not modify the URL.
        """
        return tourl

    def imageUrlModifier(self, image_url, data):
        """Optional modification of parsed image URLs. Useful if the URL
        needs to be fixed before usage. The default implementation does
        not modify the URL. The given data is the URL page data.
        """
        return image_url

    def vote(self):
        """Cast a public vote for this comic."""
        uid = get_system_uid()
        data = {"name": self.name.replace('/', '_'), "uid": uid}
        response = self.session.post(configuration.VoteUrl, data=data)
        response.raise_for_status()

    def get_download_dir(self, basepath):
        """Try to find the correct download directory, ignoring case
        differences."""
        path = basepath
        for part in self.name.split('/'):
            done = False
            if (os.path.isdir(path) and
               not os.path.isdir(os.path.join(path, part))):
                for entry in os.listdir(path):
                    if (entry.lower() == part.lower() and
                       os.path.isdir(os.path.join(path, entry))):
                        path = os.path.join(path, entry)
                        done = True
                        break
            if not done:
                path = os.path.join(path, part)
        return path
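
    # Example for get_download_dir: given a hypothetical comic name 'Foo/Bar'
    # and an already existing directory basepath/foo/bar, this returns
    # basepath/foo/bar instead of introducing a new basepath/Foo/Bar next
    # to it.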

    def getCompleteFile(self, basepath):
        """Get filename indicating all comics are downloaded."""
        dirname = self.get_download_dir(basepath)
        return os.path.join(dirname, "complete.txt")

    def isComplete(self, basepath):
        """Check if all comics are downloaded."""
        return os.path.isfile(self.getCompleteFile(basepath))

    def setComplete(self, basepath):
        """Set complete flag for this comic, i.e. all comics are downloaded."""
        if self.endOfLife:
            filename = self.getCompleteFile(basepath)
            if not os.path.exists(filename):
                with open(filename, 'w') as f:
                    f.write('All comics should be downloaded here.')

    def getPage(self, url):
        """
        Fetch a page and return the opaque representation for the data
        parameter of fetchUrls and fetchText.

        Implementation note: While this base class does not restrict how the
        returned data is structured, subclasses (specific scrapers) should
        document it, since the structure is passed into methods which can be
        overridden by comic modules, and those methods need to be able to use
        the data. (Affected methods: shouldSkipUrl, imageUrlModifier)
        """
        return get_page(url, self.session, allow_errors=self.allow_errors)

    def fetchUrls(self, url, data, urlsearch):
        raise ValueError("No implementation for fetchUrls!")

    def fetchUrl(self, url, data, urlsearch):
        return self.fetchUrls(url, data, urlsearch)[0]

    def fetchText(self, url, data, textsearch, optional):
        raise ValueError("No implementation for fetchText!")

    def getDisabledReasons(self):
        """
        Get a dict of reasons why this comic module is disabled. The key is a
        short (unique) identifier, the value is a string explaining why the
        module is deactivated. If the module is not disabled, just return an
        empty dict.
        """
        return {}

    def language(self):
        """
        Return the language of the comic as a human-readable language name
        instead of a two-letter ISO 639-1 code.
        """
        lang = 'Unknown (%s)' % self.lang
        if pycountry is None:
            if self.lang in languages.Languages:
                lang = languages.Languages[self.lang]
        else:
            try:
                lang = pycountry.languages.get(alpha_2=self.lang).name
            except KeyError:
                try:
                    lang = pycountry.languages.get(alpha2=self.lang).name
                except KeyError:
                    pass
        return lang


class _BasicScraper(Scraper):
    """
    Scraper base class that matches regular expressions against HTML pages.

    Subclasses of this scraper should use compiled regular expressions as
    values for prevSearch, imageSearch and textSearch.

    Implementation note: The return value of getPage is a tuple: the first
    element is the raw HTML page text, the second element is the base URL (if
    any).
    """

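    # Illustrative sketch (hypothetical patterns, not from a real module):
    # subclasses typically build their expressions with tagre(), e.g.
    #
    #   imageSearch = re.compile(tagre("img", "src", r'([^"]*/comics/[^"]*)'))
    #   prevSearch = re.compile(tagre("a", "href", r'([^"]*)') + "Previous")
    #
    # where the first capture group of each pattern is taken as the URL.
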
    BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))

    def getPage(self, url):
        content = super(_BasicScraper, self).getPage(url).text
        # determine base URL
        baseUrl = None
        match = self.BASE_SEARCH.search(content)
        if match:
            baseUrl = match.group(1)
        else:
            baseUrl = url
        return (content, baseUrl)

    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given URL pattern(s) in an HTML page."""
        searchUrls = []
        searches = makeSequence(urlSearch)
        for search in searches:
            for match in search.finditer(data[0]):
                searchUrl = match.group(1)
                if not searchUrl:
                    raise ValueError("Pattern %s matched empty URL at %s." %
                                     (search.pattern, url))
                out.debug(u'matched URL %r with pattern %s' %
                          (searchUrl, search.pattern))
                searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
            if searchUrls:
                # do not search other links if one pattern matched
                break
        if not searchUrls:
            patterns = [x.pattern for x in searches]
            raise ValueError("Patterns %s not found at URL %s." %
                             (patterns, url))
        return searchUrls

    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text pattern in an HTML page."""
        if textSearch:
            match = textSearch.search(data[0])
            if match:
                text = match.group(1)
                out.debug(u'matched text %r with pattern %s' %
                          (text, textSearch.pattern))
                return unescape(text).strip()
            if optional:
                return None
            else:
                raise ValueError("Pattern %s not found at URL %s." %
                                 (textSearch.pattern, url))
        else:
            return None


class _ParserScraper(Scraper):
    """
    Scraper base class that uses an HTML parser and XPath expressions.

    All links are resolved before XPath searches are applied, so all URLs are
    absolute!

    Subclasses of this class should use XPath expressions as values for
    prevSearch, imageSearch and textSearch. When the XPath directly selects an
    attribute, it is used as the output.

    All those searches try to do something intelligent when they match a
    complete HTML element: prevSearch and imageSearch try to find a "link
    attribute" and use that as the URL. textSearch strips all tags from the
    content of the HTML element and returns that.
    """

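    # Illustrative sketch (hypothetical expressions, not from a real module):
    # subclasses typically set plain XPath strings, e.g.
    #
    #   imageSearch = '//div[@id="comic"]//img/@src'
    #   prevSearch = '//a[@rel="prev"]/@href'
    #
    # or, with css = True, CSS selectors such as 'div#comic img'.
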
    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

    # Taken directly from LXML
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)

    NS = {
        "re": "http://exslt.org/regular-expressions"
    }

    # Switch between CSS and XPath selectors for this class. Since CSS needs
    # another Python module, XPath is the default for now.
    css = False

    # Activate a workaround for unescaped < characters on libxml versions
    # older than 2.9.3. This is disabled by default since most sites are not
    # THAT broken ;)
    broken_html_bugfix = False

    def getPage(self, url):
        page = super(_ParserScraper, self).getPage(url)
        if page.encoding:
            # Requests figured out the encoding, so we can deliver Unicode to
            # LXML. Unfortunately, LXML feels betrayed if there is still an
            # XML declaration with a (probably wrong!) encoding at the top of
            # the document. Web browsers ignore such declarations if the
            # encoding was specified in the HTTP header and so do we.
            # (The replacement must be a raw string so \1/\2 act as
            # backreferences, not control characters.)
            text = self.XML_DECL.sub(r'\1\2', page.text, count=1)
            tree = self._parse_page(text)
        else:
            tree = self._parse_page(page.content)
        tree.make_links_absolute(url)
        return tree

    def _parse_page(self, data):
        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
            def fix_not_open_tags(match):
                fix = (len(match.group(1)) * '&lt;') + match.group(2)
                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
                         match.group(0), fix), level=2)
                return fix
            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)

        tree = html.document_fromstring(data)
        return tree

    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given XPath in an HTML page."""
        searchUrls = []
        for match, search in self._matchPattern(data, urlSearch):
            searchUrl = None
            try:
                for attrib in html_link_attrs:
                    if attrib in match.attrib:
                        searchUrl = match.get(attrib)
            except AttributeError:
                searchUrl = str(match)
            out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
            if searchUrl is not None:
                searchUrls.append(searchUrl)

        if not searchUrls:
            raise ValueError("XPath %s not found at URL %s." %
                             (urlSearch, url))
        return searchUrls

    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text XPath in an HTML page."""
        if not textSearch:
            return None
        text = []
        for match, search in self._matchPattern(data, textSearch):
            try:
                text.append(match.text_content())
            except AttributeError:
                text.append(match)
            out.debug(u'Matched text %r with XPath %s' % (text, search))
        text = u' '.join(text)
        if text.strip() == '':
            if optional:
                return None
            else:
                raise ValueError("XPath %s did not match anything at URL %s." %
                                 (textSearch, url))
        return text.strip()

    def _matchPattern(self, data, patterns):
        if self.css:
            searchFun = data.cssselect
        else:
            def searchFun(s):
                return data.xpath(s, namespaces=self.NS)
        patterns = makeSequence(patterns)
        for search in patterns:
            matched = False
            for match in searchFun(search):
                matched = True
                yield match, search

            if matched and not self.multipleImagesPerStrip:
                # do not search other links if one pattern matched
                break

    def getDisabledReasons(self):
        res = {}
        if self.css and cssselect is None:
            res['css'] = (u"This module needs the cssselect " +
                          u"(python-cssselect) python module which is " +
                          u"not installed.")
        return res


def find_scrapers(comic, multiple_allowed=False):
    """Get a list of comic scraper objects.

    Can return more than one entry if multiple_allowed is True, else it raises
    a ValueError if multiple modules match. The match is a case-insensitive
    substring search.
    """
    if not comic:
        raise ValueError("empty comic name")
    candidates = []
    cname = comic.lower()
    for scraper in get_scrapers(include_removed=True):
        lname = scraper.name.lower()
        if lname == cname:
            # perfect match
            if not multiple_allowed:
                return [scraper]
            else:
                candidates.append(scraper)
        elif cname in lname and scraper.url:
            candidates.append(scraper)
    if len(candidates) > 1 and not multiple_allowed:
        comics = ", ".join(x.name for x in candidates)
        raise ValueError('multiple comics found: %s' % comics)
    elif not candidates:
        raise ValueError('comic %r not found' % comic)
    return candidates
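
# Example usage of find_scrapers (hypothetical names): find_scrapers('xkcd')
# returns a single-element list on an exact or unique substring match, while
# find_scrapers('comic', multiple_allowed=True) may return several candidates
# instead of raising a ValueError.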


_scrapers = None


def get_scrapers(include_removed=False):
    """Find all comic scraper classes in the plugins directory.
    The result is cached.
    @return: list of scraper instances
    @rtype: list of Scraper
    """
    global _scrapers
    if _scrapers is None:
        out.debug(u"Loading comic modules...")
        modules = loader.get_modules('plugins')
        plugins = list(loader.get_plugins(modules, Scraper))
        _scrapers = sorted([m for x in plugins for m in x.getmodules()],
                           key=lambda p: p.name)
        check_scrapers()
        out.debug(u"... %d modules loaded from %d classes." % (
            len(_scrapers), len(plugins)))
    if include_removed:
        return _scrapers
    else:
        return [x for x in _scrapers if x.url]


def check_scrapers():
    """Check for duplicate scraper names."""
    d = {}
    for scraper in _scrapers:
        name = scraper.name.lower()
        if name in d:
            name1 = scraper.name
            name2 = d[name].name
            raise ValueError('duplicate scrapers %s and %s found' %
                             (name1, name2))
        d[name] = scraper