1# -*- coding: utf-8 -*- 2# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs 3# Copyright (C) 2012-2014 Bastian Kleineidam 4# Copyright (C) 2015-2020 Tobias Gruetzmacher 5 6from __future__ import absolute_import, division, print_function 7 8import os 9import re 10from six.moves.urllib.parse import urljoin 11 12from lxml import html, etree 13from lxml.html.defs import link_attrs as html_link_attrs 14 15try: 16 import cssselect 17except ImportError: 18 cssselect = None 19 20try: 21 import pycountry 22except ImportError: 23 pycountry = None 24 25from . import configuration, http, languages, loader 26from .util import (get_page, makeSequence, get_system_uid, unescape, tagre, 27 normaliseURL, prettyMatcherList, uniq) 28from .comic import ComicStrip 29from .output import out 30from .events import getHandler 31 32 33ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') 34 35 36class Scraper(object): 37 '''Base class for all comic scraper, but without a specific scrape 38 implementation.''' 39 40 # The URL for the comic strip 41 url = None 42 43 # A string that is interpolated with the strip index to yield the URL for a 44 # particular strip. 
45 stripUrl = None 46 47 # Stop search for previous URLs at this URL 48 firstStripUrl = None 49 50 # if more than one image per URL is expected 51 multipleImagesPerStrip = False 52 53 # set to True if this comic contains adult content 54 adult = False 55 56 # set to True if this comic will not get updated anymore 57 endOfLife = False 58 59 # langauge of the comic (two-letter ISO 639-1 code) 60 lang = 'en' 61 62 # an expression that will locate the URL for the previous strip in a page 63 # this can also be a list or tuple 64 prevSearch = None 65 66 # an expression that will locate the strip image URLs strip in a page 67 # this can also be a list or tuple 68 imageSearch = None 69 70 # an expression to store a text together with the image 71 # sometimes comic strips have additional text info for each comic 72 textSearch = None 73 74 # Is the additional text required or optional? When it is required (the 75 # default), you see an error message whenever a comic page is encountered 76 # that does not have the text 77 textOptional = False 78 79 # usually the index format help 80 help = '' 81 82 # Specifing a list of HTTP error codes which should be handled as a 83 # successful request. This is a workaround for some comics which return 84 # regular pages with strange HTTP codes. By default, all HTTP errors raise 85 # exceptions. 
86 allow_errors = () 87 88 # HTTP session for configuration & cookies 89 session = http.default_session 90 91 @classmethod 92 def getmodules(cls): 93 name = cls.__name__ 94 if hasattr(cls, 'name'): 95 name = cls.name 96 return [cls(name)] 97 98 @property 99 def indexes(self): 100 return self._indexes 101 102 @indexes.setter 103 def indexes(self, val): 104 if val: 105 self._indexes = tuple(sorted(val)) 106 107 def __init__(self, name): 108 """Initialize internal variables.""" 109 self.name = name 110 self.urls = set() 111 self._indexes = tuple() 112 self.skippedUrls = set() 113 self.hitFirstStripUrl = False 114 115 def __hash__(self): 116 """Get hash value from name and index list.""" 117 return hash((self.name, self.indexes)) 118 119 def shouldSkipUrl(self, url, data): 120 """Determine if search for images in given URL should be skipped.""" 121 return False 122 123 def getComicStrip(self, url, data): 124 """Get comic strip downloader for given URL and data.""" 125 imageUrls = self.fetchUrls(url, data, self.imageSearch) 126 # map modifier function on image URLs 127 imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls] 128 # remove duplicate URLs 129 imageUrls = uniq(imageUrls) 130 if len(imageUrls) > 1 and not self.multipleImagesPerStrip: 131 out.warn( 132 u"Found %d images instead of 1 at %s with expressions %s" % 133 (len(imageUrls), url, prettyMatcherList(self.imageSearch))) 134 image = imageUrls[0] 135 out.warn(u"Choosing image %s" % image) 136 imageUrls = (image,) 137 elif not imageUrls: 138 out.warn(u"Found no images at %s with expressions %s" % (url, 139 prettyMatcherList(self.imageSearch))) 140 if self.textSearch: 141 text = self.fetchText(url, data, self.textSearch, 142 optional=self.textOptional) 143 else: 144 text = None 145 return ComicStrip(self, url, imageUrls, text=text) 146 147 def getStrips(self, maxstrips=None): 148 """Get comic strips.""" 149 if maxstrips: 150 word = u"strip" if maxstrips == 1 else "strips" 151 msg = u'Retrieving %d %s' 
% (maxstrips, word) 152 else: 153 msg = u'Retrieving all strips' 154 if self.indexes: 155 if len(self.indexes) == 1: 156 msg += u" for index %s" % self.indexes[0] 157 else: 158 msg += u" for indexes %s" % self.indexes 159 # Always call starter() since it might initialize cookies. 160 # See for example Oglaf comic. 161 self.starter() 162 urls = [self.getIndexStripUrl(index) for index in self.indexes] 163 else: 164 urls = [self.starter()] 165 if self.adult: 166 msg += u" (including adult content)" 167 out.info(msg) 168 for url in urls: 169 for strip in self.getStripsFor(url, maxstrips): 170 yield strip 171 172 def getStripsFor(self, url, maxstrips): 173 """Get comic strips for an URL. If maxstrips is a positive number, stop after 174 retrieving the given number of strips.""" 175 self.hitFirstStripUrl = False 176 seen_urls = set() 177 while url: 178 out.info(u'Get strip URL %s' % url, level=1) 179 data = self.getPage(url) 180 if self.shouldSkipUrl(url, data): 181 out.info(u'Skipping URL %s' % url) 182 self.skippedUrls.add(url) 183 else: 184 try: 185 yield self.getComicStrip(url, data) 186 except ValueError as msg: 187 # image not found 188 out.exception(msg) 189 if self.isfirststrip(url): 190 out.debug(u"Stop at first URL %s" % url) 191 self.hitFirstStripUrl = True 192 break 193 if maxstrips is not None: 194 maxstrips -= 1 195 if maxstrips <= 0: 196 break 197 prevUrl = self.getPrevUrl(url, data) 198 seen_urls.add(url) 199 if prevUrl in seen_urls: 200 # avoid recursive URL loops 201 out.warn(u"Already seen previous URL %r" % prevUrl) 202 break 203 url = prevUrl 204 205 def isfirststrip(self, url): 206 """Check if the specified URL is the first strip of a comic. 
This is 207 specially for comics taken from archive.org, since the base URL of 208 archive.org changes whenever pages are taken from a different 209 snapshot.""" 210 if not self.firstStripUrl: 211 return False 212 firsturl = ARCHIVE_ORG_URL.sub('', self.firstStripUrl) 213 currenturl = ARCHIVE_ORG_URL.sub('', url) 214 return firsturl == currenturl 215 216 def getPrevUrl(self, url, data): 217 """Find previous URL.""" 218 prevUrl = None 219 if self.prevSearch: 220 try: 221 prevUrl = self.fetchUrl(url, data, self.prevSearch) 222 except ValueError as msg: 223 # assume there is no previous URL, but print a warning 224 out.warn(u"%s Assuming no previous comic strips exist." % msg) 225 else: 226 prevUrl = self.link_modifier(url, prevUrl) 227 out.debug(u"Found previous URL %s" % prevUrl) 228 getHandler().comicPageLink(self, url, prevUrl) 229 return prevUrl 230 231 def getIndexStripUrl(self, index): 232 """Get comic strip URL from index.""" 233 return self.stripUrl % index 234 235 def starter(self): 236 """Get starter URL from where to scrape comic strips.""" 237 return self.url 238 239 def namer(self, image_url, page_url): 240 """Return filename for given image and page URL.""" 241 return None 242 243 def link_modifier(self, fromurl, tourl): 244 """Optional modification of parsed link (previous/back/latest) URLs. 245 Useful if there are domain redirects. The default implementation does 246 not modify the URL. 247 """ 248 return tourl 249 250 def imageUrlModifier(self, image_url, data): 251 """Optional modification of parsed image URLs. Useful if the URL 252 needs to be fixed before usage. The default implementation does 253 not modify the URL. The given data is the URL page data. 
254 """ 255 return image_url 256 257 def vote(self): 258 """Cast a public vote for this comic.""" 259 uid = get_system_uid() 260 data = {"name": self.name.replace('/', '_'), "uid": uid} 261 response = self.session.post(configuration.VoteUrl, data=data) 262 response.raise_for_status() 263 264 def get_download_dir(self, basepath): 265 """Try to find the corect download directory, ignoring case 266 differences.""" 267 path = basepath 268 for part in self.name.split('/'): 269 done = False 270 if (os.path.isdir(path) and 271 not os.path.isdir(os.path.join(path, part))): 272 for entry in os.listdir(path): 273 if (entry.lower() == part.lower() and 274 os.path.isdir(os.path.join(path, entry))): 275 path = os.path.join(path, entry) 276 done = True 277 break 278 if not done: 279 path = os.path.join(path, part) 280 return path 281 282 def getCompleteFile(self, basepath): 283 """Get filename indicating all comics are downloaded.""" 284 dirname = self.get_download_dir(basepath) 285 return os.path.join(dirname, "complete.txt") 286 287 def isComplete(self, basepath): 288 """Check if all comics are downloaded.""" 289 return os.path.isfile(self.getCompleteFile(basepath)) 290 291 def setComplete(self, basepath): 292 """Set complete flag for this comic, ie. all comics are downloaded.""" 293 if self.endOfLife: 294 filename = self.getCompleteFile(basepath) 295 if not os.path.exists(filename): 296 with open(filename, 'w') as f: 297 f.write('All comics should be downloaded here.') 298 299 def getPage(self, url): 300 """ 301 Fetch a page and return the opaque repesentation for the data parameter 302 of fetchUrls and fetchText. 303 304 Implementation notes: While this base class does not restrict how the 305 returned data is structured, subclasses (specific scrapers) should 306 specify how this data works, since the stracture is passed into 307 different methods which can be defined by comic modules and these 308 methods should be able to use the data if they so desire... 
(Affected 309 methods: shouldSkipUrl, imageUrlModifier) 310 """ 311 return get_page(url, self.session, allow_errors=self.allow_errors) 312 313 def fetchUrls(self, url, data, urlsearch): 314 raise ValueError("No implementation for fetchUrls!") 315 316 def fetchUrl(self, url, data, urlsearch): 317 return self.fetchUrls(url, data, urlsearch)[0] 318 319 def fetchText(self, url, data, textsearch, optional): 320 raise ValueError("No implementation for fetchText!") 321 322 def getDisabledReasons(self): 323 """ 324 Get a dict of reasons why this comic module is disabled. The key is a 325 short (unique) identifier, the value is a string explaining why the 326 module is deactivated. If the module is not disabled, just return an 327 empty dict. 328 """ 329 return {} 330 331 def language(self): 332 """ 333 Return language of the comic as a human-readable language name instead 334 of a 2-character ISO639-1 code. 335 """ 336 lang = 'Unknown (%s)' % self.lang 337 if pycountry is None: 338 if self.lang in languages.Languages: 339 lang = languages.Languages[self.lang] 340 else: 341 try: 342 lang = pycountry.languages.get(alpha_2=self.lang).name 343 except KeyError: 344 try: 345 lang = pycountry.languages.get(alpha2=self.lang).name 346 except KeyError: 347 pass 348 return lang 349 350 351class _BasicScraper(Scraper): 352 """ 353 Scraper base class that matches regular expressions against HTML pages. 354 355 Subclasses of this scraper should use compiled regular expressions as 356 values for prevSearch, imageSearch and textSearch. 357 358 Implementation note: The return value of getPage is a tuple: the first 359 element is the raw HTML page text, the second element is the base URL (if 360 any). 
class _BasicScraper(Scraper):
    """
    Scraper base class that matches regular expressions against HTML pages.

    Subclasses of this scraper should use compiled regular expressions as
    values for prevSearch, imageSearch and textSearch.

    Implementation note: The return value of getPage is a tuple: the first
    element is the raw HTML page text, the second element is the base URL (if
    any).
    """

    # Matches a <base href="..."> tag so the page's base URL can be extracted.
    BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))

    def getPage(self, url):
        """Fetch a page; return (page text, base URL) for the regex searches."""
        content = super(_BasicScraper, self).getPage(url).text
        # determine base URL
        baseUrl = None
        match = self.BASE_SEARCH.search(content)
        if match:
            baseUrl = match.group(1)
        else:
            baseUrl = url
        return (content, baseUrl)

    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given URL pattern(s) in a HTML page."""
        searchUrls = []
        searches = makeSequence(urlSearch)
        for search in searches:
            for match in search.finditer(data[0]):
                searchUrl = match.group(1)
                if not searchUrl:
                    raise ValueError("Pattern %s matched empty URL at %s." %
                                     (search.pattern, url))
                out.debug(u'matched URL %r with pattern %s' %
                          (searchUrl, search.pattern))
                searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
            if searchUrls:
                # do not search other links if one pattern matched
                break
        if not searchUrls:
            patterns = [x.pattern for x in searches]
            raise ValueError("Patterns %s not found at URL %s." %
                             (patterns, url))
        return searchUrls

    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text pattern in a HTML page."""
        if textSearch:
            match = textSearch.search(data[0])
            if match:
                text = match.group(1)
                out.debug(u'matched text %r with pattern %s' %
                          (text, textSearch.pattern))
                return unescape(text).strip()
            if optional:
                return None
            else:
                raise ValueError("Pattern %s not found at URL %s." %
                                 (textSearch.pattern, url))
        else:
            return None


class _ParserScraper(Scraper):
    """
    Scraper base class that uses a HTML parser and XPath expressions.

    All links are resolved before XPath searches are applied, so all URLs are
    absolute!

    Subclasses of this class should use XPath expressions as values for
    prevSearch, imageSearch and textSearch. When the XPath directly selects an
    attribute, it is used as the output.

    All those searches try to do something intelligent when they match a
    complete HTML Element: prevSearch and imageSearch try to find a "link
    attribute" and use that as URL. textSearch strips all tags from the content
    of the HTML element and returns that.
    """

    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

    # Taken directly from LXML
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)

    NS = {
        "re": "http://exslt.org/regular-expressions"
    }

    # Switch between CSS and XPath selectors for this class. Since CSS needs
    # another Python module, XPath is the default for now.
    css = False

    # Activate a workaround for unescaped < characters on libxml version older
    # then 2.9.3. This is disabled by default since most sites are not THAT
    # broken ;)
    broken_html_bugfix = False

    def getPage(self, url):
        """Fetch a page and return the parsed LXML tree with absolute links."""
        page = super(_ParserScraper, self).getPage(url)
        if page.encoding:
            # Requests figured out the encoding, so we can deliver Unicode to
            # LXML. Unfortunately, LXML feels betrayed if there is still an XML
            # declaration with (probably wrong!) encoding at the top of the
            # document. Web browsers ignore such if the encoding was specified
            # in the HTTP header and so do we.
            # BUGFIX: the replacement must be a raw string so \1 and \2 are
            # group backreferences; the previous non-raw '\1\2' inserted the
            # literal control characters \x01\x02 instead.
            text = self.XML_DECL.sub(r'\1\2', page.text, count=1)
            tree = self._parse_page(text)
        else:
            tree = self._parse_page(page.content)
        tree.make_links_absolute(url)
        return tree

    def _parse_page(self, data):
        """Parse HTML into an LXML document tree, optionally fixing broken
        unescaped '<' characters first (see broken_html_bugfix)."""
        if self.broken_html_bugfix and etree.LIBXML_VERSION < (2, 9, 3):
            def fix_not_open_tags(match):
                fix = (len(match.group(1)) * '<') + match.group(2)
                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
                    match.group(0), fix), level=2)
                return fix
            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)

        tree = html.document_fromstring(data)
        return tree

    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given XPath in a HTML page."""
        searchUrls = []
        for match, search in self._matchPattern(data, urlSearch):
            searchUrl = None
            try:
                # An HTML element was matched: pick the first link attribute.
                for attrib in html_link_attrs:
                    if attrib in match.attrib:
                        searchUrl = match.get(attrib)
            except AttributeError:
                # The XPath selected an attribute/string directly.
                searchUrl = str(match)
            out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
            if searchUrl is not None:
                searchUrls.append(searchUrl)

        if not searchUrls:
            raise ValueError("XPath %s not found at URL %s." %
                             (urlSearch, url))
        return searchUrls

    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text XPath in a HTML page."""
        if not textSearch:
            return None
        text = []
        for match, search in self._matchPattern(data, textSearch):
            try:
                # An HTML element was matched: strip all tags.
                text.append(match.text_content())
            except AttributeError:
                # The XPath selected a string directly.
                text.append(match)
            out.debug(u'Matched text %r with XPath %s' % (text, search))
        text = u' '.join(text)
        if text.strip() == '':
            if optional:
                return None
            else:
                raise ValueError("XPath %s did not match anything at URL %s." %
                                 (textSearch, url))
        return text.strip()

    def _matchPattern(self, data, patterns):
        """Yield (match, pattern) pairs for the given XPath/CSS pattern(s)."""
        if self.css:
            searchFun = data.cssselect
        else:
            def searchFun(s):
                return data.xpath(s, namespaces=self.NS)
        patterns = makeSequence(patterns)
        for search in patterns:
            matched = False
            for match in searchFun(search):
                matched = True
                yield match, search

            if matched and not self.multipleImagesPerStrip:
                # do not search other links if one pattern matched
                break

    def getDisabledReasons(self):
        res = {}
        if self.css and cssselect is None:
            res['css'] = (u"This module needs the cssselect " +
                          u"(python-cssselect) python module which is " +
                          u"not installed.")
        return res


def find_scrapers(comic, multiple_allowed=False):
    """Get a list comic scraper objects.

    Can return more than one entry if multiple_allowed is True, else it raises
    a ValueError if multiple modules match. The match is a case insensitive
    substring search.
    """
    if not comic:
        raise ValueError("empty comic name")
    candidates = []
    cname = comic.lower()
    for scrapers in get_scrapers(include_removed=True):
        lname = scrapers.name.lower()
        if lname == cname:
            # perfect match
            if not multiple_allowed:
                return [scrapers]
            else:
                candidates.append(scrapers)
        elif cname in lname and scrapers.url:
            candidates.append(scrapers)
    if len(candidates) > 1 and not multiple_allowed:
        comics = ", ".join(x.name for x in candidates)
        raise ValueError('multiple comics found: %s' % comics)
    elif not candidates:
        raise ValueError('comic %r not found' % comic)
    return candidates


# Cache of loaded scraper objects, filled lazily by get_scrapers().
_scrapers = None


def get_scrapers(include_removed=False):
    """Find all comic scraper classes in the plugins directory.
    The result is cached.
    @return: list of Scraper classes
    @rtype: list of Scraper
    """
    global _scrapers
    if _scrapers is None:
        out.debug(u"Loading comic modules...")
        modules = loader.get_modules('plugins')
        plugins = list(loader.get_plugins(modules, Scraper))
        _scrapers = sorted([m for x in plugins for m in x.getmodules()],
                           key=lambda p: p.name)
        check_scrapers()
        out.debug(u"... %d modules loaded from %d classes." % (
            len(_scrapers), len(plugins)))
    if include_removed:
        return _scrapers
    else:
        # "Removed" scrapers keep their name but have no URL.
        return [x for x in _scrapers if x.url]


def check_scrapers():
    """Check for duplicate scraper names."""
    d = {}
    for scraper in _scrapers:
        name = scraper.name.lower()
        if name in d:
            name1 = scraper.name
            name2 = d[name].name
            raise ValueError('duplicate scrapers %s and %s found' %
                             (name1, name2))
        d[name] = scraper