import re
import zlib
from pathlib import Path
from urllib import parse

from jinja2 import Template
from loguru import logger

from flexget import plugin
from flexget.entry import Entry
from flexget.event import event
from flexget.utils.cached_input import cached
from flexget.utils.soup import get_soup

logger = logger.bind(name='html')


class InputHtml:
    """
    Parses urls from html page. Useful on sites which have direct download
    links of any type (mp3, jpg, torrent, ...).

    Many anime-fansubbers do not provide RSS-feed, this works well in many cases.

    Configuration expects url parameter.

    Note: This returns ALL links on url so you need to configure filters
    to match only to desired content.
    """

    schema = {
        'oneOf': [
            {'type': 'string'},
            {
                'type': 'object',
                'properties': {
                    'url': {'type': 'string', 'format': 'url'},
                    'username': {'type': 'string'},
                    'password': {'type': 'string'},
                    'dump': {'type': 'string'},
                    'title_from': {'type': 'string'},
                    'allow_empty_links': {'type': 'boolean'},
                    'links_re': {'type': 'array', 'items': {'type': 'string', 'format': 'regex'}},
                    'increment': {
                        'oneOf': [
                            {'type': 'boolean'},
                            {
                                'type': 'object',
                                'properties': {
                                    'from': {'type': 'integer'},
                                    'to': {'type': 'integer'},
                                    'name': {'type': 'string'},
                                    'step': {'type': 'integer'},
                                    'stop_when_empty': {'type': 'boolean'},
                                    'entries_count': {'type': 'integer'},
                                },
                                'additionalProperties': False,
                            },
                        ]
                    },
                },
                'required': ['url'],
                'additionalProperties': False,
            },
        ]
    }

    def build_config(self, config):
        """Normalize the plugin configuration into its dict form.

        A plain string is treated as the url. Credentials embedded in the url
        (``scheme://user:password@host/...``) are moved into the ``username``
        and ``password`` keys and removed from ``url``.

        :param config: Either the url as a string or the configuration dict.
        :return: The configuration dict (a dict passed in is updated in place).
        """

        def get_auth_from_url():
            """Moves basic authentication from url to username and password fields"""
            parts = list(parse.urlsplit(config['url']))
            # Split on the LAST '@' and the FIRST ':' so that passwords
            # containing either character do not corrupt the host name.
            userinfo, at_sign, host = parts[1].rpartition('@')
            if not at_sign:
                return
            username, colon, password = userinfo.partition(':')
            if colon:
                config['username'], config['password'] = username, password
            else:
                logger.warning('Invalid basic authentication in url: {}', config['url'])
            parts[1] = host
            config['url'] = parse.urlunsplit(parts)

        if isinstance(config, str):
            config = {'url': config}
        get_auth_from_url()
        return config

    @cached('html')
    @plugin.internet(logger)
    def on_task_input(self, task, config):
        """Fetch the configured page(s) and return the entries found on them.

        Without ``increment`` a single request is made to ``url``. With
        ``increment`` the ``url`` (and ``dump``, when set) is rendered as a
        Jinja2 template once per counter value, and the pages are requested
        until ``to`` is reached, a page yields no entries (``stop_when_empty``)
        or at least ``entries_count`` entries have been collected.

        :param task: Task whose ``requests`` session performs the requests.
        :param config: Plugin configuration (string or dict form).
        :return: List of entries.
        """
        config = self.build_config(config)

        auth = None
        if config.get('username') and config.get('password'):
            # Deliberately log the user name only; the password must never
            # end up in log files.
            logger.debug('Basic auth enabled. User: {}', config['username'])
            auth = (config['username'], config['password'])

        increment = config.get('increment')
        base_url = config['url']
        if not increment:
            return self._request_url(task, config, base_url, auth, dump_name=config.get('dump'))

        # `increment: yes` means "use the defaults for everything".
        if not isinstance(increment, dict):
            increment = {}
        current = increment.get('from', 0)
        to = increment.get('to')
        step = increment.get('step', 1)
        entries_count = increment.get('entries_count', 500)
        stop_when_empty = increment.get('stop_when_empty', True)
        increment_name = increment.get('name', 'i')

        template_url = Template(base_url)
        dump_template = config.get('dump')
        template_dump = Template(dump_template) if dump_template else None

        # Start from an empty list so a list is returned even when the loop
        # body never runs (e.g. `from` >= `to`).
        entries = []
        while to is None or current < to:
            render_ctx = {increment_name: current}
            url = template_url.render(**render_ctx)
            dump_name = template_dump.render(**render_ctx) if template_dump else None
            new_entries = self._request_url(task, config, url, auth, dump_name)
            entries.extend(new_entries)
            if stop_when_empty and not new_entries:
                break
            if entries_count and len(entries) >= entries_count:
                break
            current += step
        return entries

    def _request_url(self, task, config, url, auth, dump_name=None):
        """Request one page, optionally dump it to a file, and parse its links.

        :param task: Task whose ``requests`` session performs the request.
        :param config: Plugin configuration dict, passed on to `create_entries`.
        :param url: Address of the page to request.
        :param auth: ``(username, password)`` tuple or None.
        :param dump_name: When set, the prettified page is written to this path.
        :return: List of entries created from the page.
        """
        logger.verbose('Requesting: {}', url)
        page = task.requests.get(url, auth=auth)
        logger.verbose('Response: {} ({})', page.status_code, page.reason)
        soup = get_soup(page.content)

        # dump received content into a file
        if dump_name:
            logger.verbose('Dumping: {}', dump_name)
            data = soup.prettify()
            with open(dump_name, 'w', encoding='utf-8') as f:
                f.write(data)

        return self.create_entries(url, soup, config)

    def _title_from_link(self, link, log_link):
        """Return a title taken from the link text, or None if there is none.

        When the link itself has no text, the string of the element following
        it is tried as a long shot.

        :param link: The ``<a>`` tag.
        :param log_link: Printable form of the link url, used for logging only.
        """
        title = link.text
        # longshot from next element (?)
        if not title:
            # There may be no following element at all (link is the last node).
            next_element = link.next
            title = next_element.string if next_element is not None else None
            if title is None:
                logger.debug('longshot failed for {}', log_link)
                return None
        return title or None

    @staticmethod
    def _title_from_url(url):
        """Derive a title from a url.

        For magnet links the ``dn`` (display name) query parameter is used,
        for everything else the last component of the path. The result is
        url-decoded; an empty string is returned when nothing suitable is found.
        """
        parts = parse.urlsplit(url)
        name = ''
        if parts.scheme == 'magnet':
            # `dn` may be the first or the last parameter of the query, so
            # anchor on start/end of the string as well as on '&'.
            match = re.search(r'(?:^|&)dn(?:\.\d)?=(.+?)(?:&|$)', parts.query)
            if match:
                name = match.group(1)
        else:
            name = Path(parts.path).name
        return parse.unquote_plus(name)

    def create_entries(self, page_url, soup, config):
        """Create an entry for every acceptable ``<a href>`` in the page.

        :param page_url: Url of the page, used to resolve relative links.
        :param soup: Parsed page.
        :param config: Plugin configuration dict. In ``auto`` title mode
            ``config['title_from']`` may be overwritten when link texts turn
            out to be mostly duplicates.
        :return: List of entries with unique titles.
        :raises plugin.PluginError: On an unknown ``title_from`` value.
        """
        queue = []
        # Titles already present in `queue`, for constant-time duplicate checks.
        seen_titles = set()
        duplicates = {}
        duplicate_limit = 4

        # These do not change while iterating; the only place that modifies
        # `title_from` restarts the whole method right away.
        allow_empty_links = config.get('allow_empty_links', False)
        regexps = config.get('links_re', None)
        title_from = config.get('title_from', 'auto')

        for link in soup.find_all('a'):
            # not a valid link
            if not link.has_attr('href'):
                continue
            # no content in the link
            if not link.contents and not allow_empty_links:
                continue

            url = link['href']
            # fix broken urls
            if url.startswith('//'):
                url = 'http:' + url
            elif not url.startswith(('http://', 'https://')):
                url = parse.urljoin(page_url, url)

            # line breaks would garble the log output
            log_link = url.replace('\n', '').replace('\r', '')

            # get only links matching regexp
            if regexps and not any(re.search(regexp, url) for regexp in regexps):
                logger.debug('url does not match any "links_re": {}', url)
                continue

            if title_from == 'url':
                title = self._title_from_url(url)
                logger.debug('title from url: {}', title)
            elif title_from == 'title':
                if not link.has_attr('title'):
                    logger.warning("Link `{}` doesn't have title attribute, ignored.", log_link)
                    continue
                title = link['title']
                logger.debug('title from title: {}', title)
            elif title_from == 'auto':
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                # automatic mode, check if title is unique
                # if there are too many duplicate titles, switch to title_from: url
                if title in seen_titles:
                    # ignore index links as a counter
                    if 'index' in title and len(title) < 10:
                        logger.debug('ignored index title {}', title)
                        continue
                    duplicates[title] = duplicates.get(title, 0) + 1
                    if duplicates[title] > duplicate_limit:
                        # if from url seems to be bad choice use title
                        from_url = self._title_from_url(url)
                        switch_to = 'title' if from_url.endswith(('.html', '.php')) else 'url'
                        logger.info(
                            "Link names seem to be useless, auto-configuring 'title_from: {}'. "
                            "This may not work well, you might need to configure it yourself.",
                            switch_to,
                        )
                        config['title_from'] = switch_to
                        # start from the beginning ...
                        return self.create_entries(page_url, soup, config)
            elif title_from in ('link', 'contents'):
                # link from link name
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                logger.debug('title from link: {}', title)
            else:
                raise plugin.PluginError('Unknown title_from value %s' % title_from)

            if not title:
                logger.warning('title could not be determined for link {}', log_link)
                continue

            # strip unicode white spaces
            title = title.replace('\u200B', '').strip()

            # in case the title contains xxxxxxx.torrent - foooo.torrent clean it a bit (get up to first .torrent)
            # TODO: hack
            torrent_pos = title.lower().find('.torrent')
            if torrent_pos > 0:
                title = title[:torrent_pos]

            if title in seen_titles:
                # title link should be unique, add CRC32 to end if it's not
                checksum = zlib.crc32(url.encode('utf-8'))
                crc32 = '%08X' % (checksum & 0xFFFFFFFF)
                title = '%s [%s]' % (title, crc32)
                # truly duplicate, title + url crc already exists in queue
                if title in seen_titles:
                    continue
                logger.debug('uniqued title to {}', title)

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            if 'username' in config and 'password' in config:
                entry['download_auth'] = (config['username'], config['password'])

            queue.append(entry)
            seen_titles.add(entry['title'])

        # add from queue to task
        return queue


@event('plugin.register')
def register_plugin():
    """Register the plugin with FlexGet under the name ``html``."""
    plugin.register(InputHtml, 'html', api_ver=2)