# -*- coding: utf-8 -*-

# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://idol.sankakucomplex.com/"""

from .sankaku import SankakuExtractor
from .common import Message
from ..cache import cache
from .. import text, util, exception
import collections
import re


class IdolcomplexExtractor(SankakuExtractor):
    """Base class for idolcomplex extractors"""
    category = "idolcomplex"
    # cookies checked by login() and returned by _login_impl()
    cookienames = ("login", "pass_hash")
    cookiedomain = "idol.sankakucomplex.com"
    root = "https://" + cookiedomain
    request_interval = 5.0

    def __init__(self, match):
        SankakuExtractor.__init__(self, match)
        # stays True unless login() finds neither cookies nor a username
        self.logged_in = True
        self.start_page = 1
        # number of leading post ids that items() skips
        self.start_post = 0
        # when true, _parse_post() adds per-type "tags_<type>" fields
        self.extags = self.config("tags", False)

    def items(self):
        """Log in and yield a Directory and a Url message for each post"""
        self.login()
        # NOTE(review): metadata() is not defined in this class; the tag
        # and pool extractors define it, the post extractor presumably
        # inherits it from SankakuExtractor -- confirm
        data = self.metadata()

        for post_id in util.advance(self.post_ids(), self.start_post):
            post = self._parse_post(post_id)
            url = post["file_url"]
            post.update(data)
            text.nameext_from_url(url, post)
            yield Message.Directory, post
            yield Message.Url, url, post

    def skip(self, num):
        """Arrange for the first 'num' posts to be skipped; return 'num'"""
        self.start_post += num
        return num

    def post_ids(self):
        """Return an iterable containing all relevant post ids"""
        # subclasses override this; return an empty iterable instead of
        # None so that the documented contract holds for the base class
        return ()

    def login(self):
        """Ensure session cookies are set, logging in if necessary

        Does nothing if the cookies in 'cookienames' are already present.
        Otherwise logs in with the configured credentials, or sets
        'logged_in' to False when no username is available.
        """
        if self._check_cookies(self.cookienames):
            return
        username, password = self._get_auth_info()
        if username:
            cookies = self._login_impl(username, password)
            self._update_cookies(cookies)
        else:
            self.logged_in = False

    # NOTE(review): keyarg=1 presumably keys the cache on 'username';
    # maxage is 90*24*3600 (90 days if the unit is seconds) -- confirm
    # against the cache module
    @cache(maxage=90*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting login cookies

        Raises exception.AuthenticationError if the request was not
        redirected or did not end up on the user's home page.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/user/authenticate"
        data = {
            "url"           : "",
            "user[name]"    : username,
            "user[password]": password,
            "commit"        : "Login",
        }
        response = self.request(url, method="POST", data=data)

        if not response.history or response.url != self.root + "/user/home":
            raise exception.AuthenticationError()
        # the cookies are read from the first response of the redirect chain
        cookies = response.history[0].cookies
        return {c: cookies[c] for c in self.cookienames}

    def _parse_post(self, post_id):
        """Extract metadata of a single post

        'post_id' is the post's id as a string. Returns a dict with the
        keys "id", "md5", "tags", "vote_average", "vote_count",
        "created_at", "rating", "file_url", "width" and "height", plus
        one "tags_<type>" entry per tag type if 'extags' is enabled.

        Raises exception.StopExtraction if the page contains no file URL.
        """
        url = self.root + "/post/show/" + post_id
        page = self.request(url, retries=10).text
        extr = text.extract

        # each call continues from the position reached by the previous
        # one, so the order of these statements matters
        tags   , pos = extr(page, "<title>", " | ")
        vavg   , pos = extr(page, "itemprop=ratingValue>", "<", pos)
        vcnt   , pos = extr(page, "itemprop=reviewCount>", "<", pos)
        _      , pos = extr(page, "Posted: <", "", pos)
        created, pos = extr(page, ' title="', '"', pos)
        rating = extr(page, "<li>Rating: ", "<", pos)[0]

        file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
        if file_url:
            width , pos = extr(page, '>', 'x', pos)
            height, pos = extr(page, '', ' ', pos)
        else:
            # no "Original" link: fall back to the <object>/<embed> markup
            width , pos = extr(page, '<object width=', ' ', pos)
            height, pos = extr(page, 'height=', '>', pos)
            file_url = extr(page, '<embed src="', '"', pos)[0]

        if not file_url:
            # neither the "Original" link nor an <embed> was found; fail
            # with a clear message instead of an AttributeError on None
            raise exception.StopExtraction(
                "Unable to find the file URL of post " + post_id)

        data = {
            "id": text.parse_int(post_id),
            # filename of the URL up to its first "."
            "md5": file_url.rpartition("/")[2].partition(".")[0],
            "tags": text.unescape(tags),
            "vote_average": text.parse_float(vavg),
            "vote_count": text.parse_int(vcnt),
            "created_at": created,
            # first letter of the rating in lowercase, "?" if missing
            "rating": (rating or "?")[0].lower(),
            "file_url": "https:" + text.unescape(file_url),
            "width": text.parse_int(width),
            "height": text.parse_int(height),
        }

        if self.extags:
            # group the tags of the sidebar by their "tag-type-..." class
            tags = collections.defaultdict(list)
            tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
            pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
            for tag_type, tag_name in pattern.findall(tags_html or ""):
                tags[tag_type].append(text.unquote(tag_name))
            for key, value in tags.items():
                data["tags_" + key] = " ".join(value)

        return data


class IdolcomplexTagExtractor(IdolcomplexExtractor):
    """Extractor for images from idol.sankakucomplex.com by search-tags"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{id}"
    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
    test = (
        ("https://idol.sankakucomplex.com/?tags=lyumos", {
            "count": 5,
            "range": "18-22",
            "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                       r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
        }),
        ("https://idol.sankakucomplex.com/?tags=order:favcount", {
            "count": 5,
            "range": "18-22",
        }),
        ("https://idol.sankakucomplex.com"
         "/?tags=lyumos+wreath&page=3&next=694215"),
    )
    # used by skip() and metadata() to convert between pages and posts
    per_page = 20

    def __init__(self, match):
        IdolcomplexExtractor.__init__(self, match)
        query = text.parse_query(match.group(1))
        self.tags = text.unquote(query.get("tags", "").replace("+", " "))
        self.start_page = text.parse_int(query.get("page"), 1)
        # value of the URL's "next" parameter; 0 if absent
        self.next = text.parse_int(query.get("next"), 0)

    def skip(self, num):
        """Arrange for the first 'num' posts to be skipped; return 'num'"""
        if self.next:
            # requests use "next" instead of "page" in this mode, so
            # posts can only be skipped one by one
            self.start_post += num
        else:
            # skip whole pages where possible, the remainder post by post
            pages, posts = divmod(num, self.per_page)
            self.start_page += pages
            self.start_post += posts
        return num

    def metadata(self):
        """Return search metadata and clamp the start page

        Raises exception.StopExtraction if more than 4 tags are searched
        without being logged in.
        """
        if not self.next:
            # NOTE(review): 50/25 is presumably the highest page the site
            # serves to members/non-members -- confirm
            max_page = 50 if self.logged_in else 25
            if self.start_page > max_page:
                self.log.info("Traversing from page %d to page %d",
                              max_page, self.start_page)
                # start at max_page and skip the excess pages post by post
                self.start_post += self.per_page * (self.start_page - max_page)
                self.start_page = max_page

        tags = self.tags.split()
        if not self.logged_in and len(tags) > 4:
            raise exception.StopExtraction(
                "Non-members can only search up to 4 tags at once")
        return {"search_tags": " ".join(tags)}

    def post_ids(self):
        """Yield the post ids of all search result pages"""
        params = {"tags": self.tags}

        if self.next:
            params["next"] = self.next
        else:
            params["page"] = self.start_page

        while True:
            page = self.request(self.root, params=params, retries=10).text
            # only look at ids after the "more popular posts" marker; if
            # it is missing, find() returns -1 and the search starts at 0
            pos = page.find("<div id=more-popular-posts-link>") + 1
            yield from text.extract_iter(page, '" id=p', '>', pos)

            next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
            if not next_url:
                return

            next_params = text.parse_query(text.unescape(
                next_url).lstrip("?/"))

            if "next" in next_params:
                # stop if the same "next" value occurs twice in a row (#265)
                if "next" in params and params["next"] == next_params["next"]:
                    return
                next_params["page"] = "2"
            params = next_params


class IdolcomplexPoolExtractor(IdolcomplexExtractor):
    """Extractor for image-pools from idol.sankakucomplex.com"""
    subcategory = "pool"
    directory_fmt = ("{category}", "pool", "{pool}")
    archive_fmt = "p_{pool}_{id}"
    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
    test = ("https://idol.sankakucomplex.com/pool/show/145", {
        "count": 3,
    })
    # a page with fewer ids than this is treated as the last one
    per_page = 24

    def __init__(self, match):
        IdolcomplexExtractor.__init__(self, match)
        self.pool_id = match.group(1)

    def skip(self, num):
        """Arrange for the first 'num' posts to be skipped; return 'num'"""
        # skip whole pages where possible, the remainder post by post
        pages, posts = divmod(num, self.per_page)
        self.start_page += pages
        self.start_post += posts
        return num

    def metadata(self):
        """Return the pool id as metadata for every post"""
        return {"pool": self.pool_id}

    def post_ids(self):
        """Yield the post ids of all pages of the pool"""
        url = self.root + "/pool/show/" + self.pool_id
        params = {"page": self.start_page}

        while True:
            page = self.request(url, params=params, retries=10).text
            ids = list(text.extract_iter(page, '" id=p', '>'))

            yield from ids
            if len(ids) < self.per_page:
                return
            params["page"] += 1


class IdolcomplexPostExtractor(IdolcomplexExtractor):
    """Extractor for single images from idol.sankakucomplex.com"""
    subcategory = "post"
    archive_fmt = "{id}"
    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
    test = ("https://idol.sankakucomplex.com/post/show/694215", {
        "content": "694ec2491240787d75bf5d0c75d0082b53a85afd",
        "options": (("tags", True),),
        "keyword": {
            "tags_character": "shani_(the_witcher)",
            "tags_copyright": "the_witcher",
            "tags_idol": str,
            "tags_medium": str,
            "tags_general": str,
        },
    })

    def __init__(self, match):
        IdolcomplexExtractor.__init__(self, match)
        self.post_id = match.group(1)

    def post_ids(self):
        """Return the single post id taken from the URL"""
        return (self.post_id,)