1import re
2import zlib
3from pathlib import Path
4from urllib import parse
6from jinja2 import Template
7from loguru import logger
9from flexget import plugin
10from flexget.entry import Entry
11from flexget.event import event
12from flexget.utils.cached_input import cached
13from flexget.utils.soup import get_soup
# Rebind the imported loguru logger with name='html' attached, so every
# message emitted by this module carries that bound value.
logger = logger.bind(name='html')
class InputHtml:
    """
    Parses urls from html page. Useful on sites which have direct download
    links of any type (mp3, jpg, torrent, ...).

    Many anime-fansubbers do not provide RSS-feed, this works well in many cases.

    Configuration expects url parameter.

    Note: This returns ALL links on url so you need to configure filters
    to match only to desired content.
    """

    schema = {
        'oneOf': [
            {'type': 'string'},
            {
                'type': 'object',
                'properties': {
                    'url': {'type': 'string', 'format': 'url'},
                    'username': {'type': 'string'},
                    'password': {'type': 'string'},
                    'dump': {'type': 'string'},
                    'title_from': {'type': 'string'},
                    'allow_empty_links': {'type': 'boolean'},
                    'links_re': {'type': 'array', 'items': {'type': 'string', 'format': 'regex'}},
                    'increment': {
                        'oneOf': [
                            {'type': 'boolean'},
                            {
                                'type': 'object',
                                'properties': {
                                    'from': {'type': 'integer'},
                                    'to': {'type': 'integer'},
                                    'name': {'type': 'string'},
                                    'step': {'type': 'integer'},
                                    'stop_when_empty': {'type': 'boolean'},
                                    'entries_count': {'type': 'integer'},
                                },
                                'additionalProperties': False,
                            },
                        ]
                    },
                },
                'required': ['url'],
                'additionalProperties': False,
            },
        ]
    }

    def build_config(self, config):
        """Normalize the plugin configuration into a dict.

        A bare string is treated as the url. If the url embeds basic
        authentication (``scheme://user:password@host/...``) the credentials
        are moved into the ``username`` / ``password`` keys and removed from
        the url. A dict passed in is modified in place and returned.

        :param config: Either the url as a string or the configuration dict.
        :return: The configuration dict.
        """
        if isinstance(config, str):
            config = {'url': config}

        parts = list(parse.urlsplit(config['url']))
        # The host never contains '@', so everything before the LAST '@' is
        # userinfo. This keeps passwords that contain '@' intact.
        userinfo, at_sign, host = parts[1].rpartition('@')
        if at_sign:
            # Only the FIRST ':' separates user from password, so passwords
            # containing ':' are accepted as well.
            username, colon, password = userinfo.partition(':')
            if colon:
                config['username'], config['password'] = username, password
            else:
                logger.warning('Invalid basic authentication in url: {}', config['url'])
            parts[1] = host
            config['url'] = parse.urlunsplit(parts)
        return config

    @cached('html')
    @plugin.internet(logger)
    def on_task_input(self, task, config):
        """Fetch the configured page(s) and return the entries found on them.

        Without ``increment`` a single request is made to ``url``. With
        ``increment`` the ``url`` (and ``dump``, if set) are rendered as Jinja2
        templates with the counter exposed under ``increment.name`` (default
        ``i``). The counter starts at ``from`` (default 0), advances by
        ``step`` (default 1) and stops before ``to`` when that is given.
        Paging also stops when a page yields no entries (unless
        ``stop_when_empty`` is false) or once ``entries_count`` (default 500)
        entries have been collected.

        :param task: Task whose ``requests`` session performs the fetches.
        :param config: Plugin configuration (string or dict).
        :return: List of entries; empty when the increment range is empty.
        """
        config = self.build_config(config)

        auth = None
        if config.get('username') and config.get('password'):
            # Deliberately log the user only; credentials must not end up in log files.
            logger.debug('Basic auth enabled. User: {}', config['username'])
            auth = (config['username'], config['password'])

        base_url = config['url']
        increment = config.get('increment')
        if not increment:
            return self._request_url(task, config, base_url, auth, dump_name=config.get('dump'))

        # `increment: yes` means "use all defaults".
        if not isinstance(increment, dict):
            increment = {}
        current = increment.get('from', 0)
        to = increment.get('to')
        step = increment.get('step', 1)
        entries_count = increment.get('entries_count', 500)
        stop_when_empty = increment.get('stop_when_empty', True)
        increment_name = increment.get('name', 'i')

        template_url = Template(base_url)
        template_dump = Template(config['dump']) if config.get('dump') else None

        entries = []
        while to is None or current < to:
            render_ctx = {increment_name: current}
            url = template_url.render(**render_ctx)
            dump_name = template_dump.render(**render_ctx) if template_dump else None
            new_entries = self._request_url(task, config, url, auth, dump_name)
            entries.extend(new_entries)
            if stop_when_empty and not new_entries:
                break
            if entries_count and len(entries) >= entries_count:
                break
            current += step
        return entries

    def _request_url(self, task, config, url, auth, dump_name=None):
        """Download ``url``, optionally dump the parsed page, and build entries.

        :param task: Task whose ``requests`` session is used for the GET.
        :param config: Plugin configuration dict, forwarded to ``create_entries``.
        :param url: Address to request.
        :param auth: ``(username, password)`` tuple or None.
        :param dump_name: If truthy, path of a file that receives the prettified page.
        :return: Result of ``create_entries`` for the downloaded page.
        """
        logger.verbose('Requesting: {}', url)
        page = task.requests.get(url, auth=auth)
        logger.verbose('Response: {} ({})', page.status_code, page.reason)
        soup = get_soup(page.content)

        # dump received content into a file
        if dump_name:
            logger.verbose('Dumping: {}', dump_name)
            data = soup.prettify()
            with open(dump_name, 'w', encoding='utf-8') as f:
                f.write(data)

        return self.create_entries(url, soup, config)

    def _title_from_link(self, link, log_link):
        """Return a title taken from the link text, or None if there is none.

        When the link has no text, the ``string`` of the node following the
        link is tried instead.

        :param link: The anchor element.
        :param log_link: Printable form of the link url, used in log messages only.
        :return: The title, or None when no non-empty title could be found.
        """
        title = link.text
        if not title:
            # longshot from next element (?)
            following = link.next
            # `link.next` is None when nothing follows the link; treat that
            # as a failed longshot instead of raising AttributeError.
            title = following.string if following is not None else None
            if title is None:
                logger.debug('longshot failed for {}', log_link)
                return None
        return title or None

    @staticmethod
    def _title_from_url(url):
        """Derive a title from the url itself.

        For ``magnet:`` urls the display name (``dn`` / ``dn.N`` query
        parameter) is used wherever it appears in the query; for anything else
        the last path component is used. The result is percent- and
        plus-decoded.

        :param url: The link url.
        :return: The decoded name, or an empty string when none is present.
        """
        parts = parse.urlsplit(url)
        name = ''
        if parts.scheme == 'magnet':
            # `dn` may be the first, a middle or the last parameter, so anchor
            # on start-of-query or '&' and read up to the next '&' or the end.
            match = re.search(r'(?:^|&)dn(?:\.\d+)?=([^&]+)', parts.query)
            if match:
                name = match.group(1)
        else:
            name = Path(parts.path).name
        return parse.unquote_plus(name)

    def create_entries(self, page_url, soup, config):
        """Build one entry per acceptable ``<a href>`` found in ``soup``.

        Links are skipped when they have no ``href``, have no contents (unless
        ``allow_empty_links``), match none of ``links_re``, or yield no title.
        The title source is chosen by ``title_from`` (``auto``, ``url``,
        ``title``, ``link`` / ``contents``). In ``auto`` mode, once the same
        link text has repeated more than a fixed number of times,
        ``config['title_from']`` is rewritten to ``url`` or ``title`` and the
        page is processed again from the start. Titles that collide get the
        CRC32 of their url appended.

        :param page_url: Url of the page, used to resolve relative links.
        :param soup: Parsed page.
        :param config: Plugin configuration dict; ``title_from`` may be updated.
        :return: List of created entries.
        :raises plugin.PluginError: On an unknown ``title_from`` value.
        """
        queue = []
        # Titles already queued, kept in a set for constant-time duplicate checks.
        seen_titles = set()
        duplicates = {}
        duplicate_limit = 4

        # Values that do not change while iterating over the links.
        link_patterns = [re.compile(pattern) for pattern in config.get('links_re') or []]
        allow_empty_links = config.get('allow_empty_links', False)
        title_from = config.get('title_from', 'auto')
        has_download_auth = 'username' in config and 'password' in config

        for link in soup.find_all('a'):
            # not a valid link
            if not link.has_attr('href'):
                continue
            # no content in the link
            if not link.contents and not allow_empty_links:
                continue

            url = link['href']
            # fix broken urls
            if url.startswith('//'):
                url = 'http:' + url
            else:
                # Resolve against the page url; this is applied to every
                # remaining href, relative or absolute.
                url = parse.urljoin(page_url, url)

            log_link = url.replace('\n', '').replace('\r', '')

            # get only links matching regexp
            if link_patterns and not any(pattern.search(url) for pattern in link_patterns):
                logger.debug('url does not match any "links_re": {}', url)
                continue

            if title_from == 'url':
                title = self._title_from_url(url)
                logger.debug('title from url: {}', title)
            elif title_from == 'title':
                if not link.has_attr('title'):
                    logger.warning("Link `{}` doesn't have title attribute, ignored.", log_link)
                    continue
                title = link['title']
                logger.debug('title from title: {}', title)
            elif title_from == 'auto':
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                # automatic mode, check if title is unique
                # if there are too many duplicate titles, switch to title_from: url
                if title in seen_titles:
                    # ignore index links as a counter
                    if 'index' in title and len(title) < 10:
                        logger.debug('ignored index title {}', title)
                        continue
                    duplicates[title] = duplicates.get(title, 0) + 1
                    if duplicates[title] > duplicate_limit:
                        # if from url seems to be bad choice use title
                        from_url = self._title_from_url(url)
                        switch_to = 'title' if from_url.endswith(('.html', '.php')) else 'url'
                        logger.info(
                            "Link names seem to be useless, auto-configuring 'title_from: {}'. This may not work well, you might need to configure it yourself.",
                            switch_to,
                        )
                        config['title_from'] = switch_to
                        # start from the beginning  ...
                        return self.create_entries(page_url, soup, config)
            elif title_from == 'link' or title_from == 'contents':
                # link from link name
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                logger.debug('title from link: {}', title)
            else:
                raise plugin.PluginError('Unknown title_from value %s' % title_from)

            if not title:
                logger.warning('title could not be determined for link {}', log_link)
                continue

            # strip unicode white spaces
            title = title.replace('\u200B', '').strip()
            if not title:
                # The title consisted of whitespace only; do not queue an
                # entry with an empty title.
                logger.warning('title could not be determined for link {}', log_link)
                continue

            # in case the title contains xxxxxxx.torrent - foooo.torrent clean it a bit (get up to first .torrent)
            # TODO: hack
            torrent_at = title.lower().find('.torrent')
            if torrent_at > 0:
                title = title[:torrent_at]

            if title in seen_titles:
                # title link should be unique, add CRC32 to end if it's not
                crc32 = '%08X' % (zlib.crc32(url.encode("utf-8")) & 0xFFFFFFFF)
                title = '%s [%s]' % (title, crc32)
                # truly duplicate, title + url crc already exists in queue
                if title in seen_titles:
                    continue
                logger.debug('uniqued title to {}', title)

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            if has_download_auth:
                entry['download_auth'] = (config['username'], config['password'])

            queue.append(entry)
            seen_titles.add(entry['title'])

        # add from queue to task
        return queue
# NOTE(review): hooked to the 'plugin.register' event so the function is
# actually invoked; without the decorator nothing in this file calls it and
# the imported `event` stays unused. Presumably FlexGet fires this event
# while loading plugins - confirm against the plugin loader.
@event('plugin.register')
def register_plugin():
    """Register ``InputHtml`` under the plugin name ``html`` with ``api_ver=2``."""
    plugin.register(InputHtml, 'html', api_ver=2)