import re
import zlib
from pathlib import Path
from urllib import parse

from jinja2 import Template
from loguru import logger

from flexget import plugin
from flexget.entry import Entry
from flexget.event import event
from flexget.utils.cached_input import cached
from flexget.utils.soup import get_soup

logger = logger.bind(name='html')


class InputHtml:
    """
    Parses URLs from an HTML page. Useful on sites that provide direct download
    links of any type (mp3, jpg, torrent, ...).

    Many anime fansubbers do not provide an RSS feed; this works well in many cases.

    Configuration expects the url parameter.

    Note: This returns ALL links on the page, so you need to configure filters
    to match only the desired content.
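
    Example (illustrative; the site URL is hypothetical)::

      html: http://example.com/releases/

    Or with options::

      html:
        url: http://example.com/releases/
        title_from: url
        links_re:
          - torrent$

    With increment the url is rendered as a Jinja2 template, substituting a
    counter (named ``i`` by default)::

      html:
        url: http://example.com/releases?page={{ i }}
        increment:
          from: 1
          to: 5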
29    """
30
    schema = {
        'oneOf': [
            {'type': 'string'},
            {
                'type': 'object',
                'properties': {
                    'url': {'type': 'string', 'format': 'url'},
                    'username': {'type': 'string'},
                    'password': {'type': 'string'},
                    'dump': {'type': 'string'},
                    'title_from': {'type': 'string'},
                    'allow_empty_links': {'type': 'boolean'},
                    'links_re': {'type': 'array', 'items': {'type': 'string', 'format': 'regex'}},
                    'increment': {
                        'oneOf': [
                            {'type': 'boolean'},
                            {
                                'type': 'object',
                                'properties': {
                                    'from': {'type': 'integer'},
                                    'to': {'type': 'integer'},
                                    'name': {'type': 'string'},
                                    'step': {'type': 'integer'},
                                    'stop_when_empty': {'type': 'boolean'},
                                    'entries_count': {'type': 'integer'},
                                },
                                'additionalProperties': False,
                            },
                        ]
                    },
                },
                'required': ['url'],
                'additionalProperties': False,
            },
        ]
    }

    def build_config(self, config):
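        """Normalize a bare url string into a dict config and extract basic auth from the url."""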
        def get_auth_from_url():
            """Moves basic authentication from the url into the username and password fields"""
            parts = list(parse.urlsplit(config['url']))
            split = parts[1].split('@')
            if len(split) > 1:
                auth = split[0].split(':')
                if len(auth) == 2:
                    config['username'], config['password'] = auth[0], auth[1]
                else:
                    logger.warning('Invalid basic authentication in url: {}', config['url'])
                parts[1] = split[1]
                config['url'] = parse.urlunsplit(parts)

        if isinstance(config, str):
            config = {'url': config}
        get_auth_from_url()
        return config

    @cached('html')
    @plugin.internet(logger)
    def on_task_input(self, task, config):
        config = self.build_config(config)

        auth = None
        if config.get('username') and config.get('password'):
            logger.debug(
                'Basic auth enabled. User: {} Password: {}', config['username'], config['password']
            )
            auth = (config['username'], config['password'])

        increment = config.get('increment')
        base_url = config['url']
        if increment:
            entries = []
            if not isinstance(increment, dict):
                increment = {}
            current = increment.get('from', 0)
            to = increment.get('to')
            step = increment.get('step', 1)
            entries_count = increment.get('entries_count', 500)
            stop_when_empty = increment.get('stop_when_empty', True)
            increment_name = increment.get('name', 'i')

            template_url = Template(base_url)
            template_dump = Template(config['dump']) if config.get('dump') else None

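            # Fetch successive pages until the upper bound is reached, a page
            # yields no entries (with stop_when_empty), or the entry cap is hit.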
            while to is None or current < to:
                render_ctx = {increment_name: current}
                url = template_url.render(**render_ctx)
                dump_name = template_dump.render(**render_ctx) if template_dump else None
                new_entries = self._request_url(task, config, url, auth, dump_name)
                entries.extend(new_entries)
                if stop_when_empty and not new_entries:
                    break
                if entries_count and len(entries) >= entries_count:
                    break
                current += step
            return entries
        else:
            return self._request_url(task, config, base_url, auth, dump_name=config.get('dump'))

    def _request_url(self, task, config, url, auth, dump_name=None):
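        """Fetch the url, optionally dump the prettified HTML to dump_name, and create entries."""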
        logger.verbose('Requesting: {}', url)
        page = task.requests.get(url, auth=auth)
        logger.verbose('Response: {} ({})', page.status_code, page.reason)
        soup = get_soup(page.content)

        # dump received content into a file
        if dump_name:
            logger.verbose('Dumping: {}', dump_name)
            data = soup.prettify()
            with open(dump_name, 'w', encoding='utf-8') as f:
                f.write(data)

        return self.create_entries(url, soup, config)

    def _title_from_link(self, link, log_link):
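        """Extract a title from the link text, with a fallback to the next node's string."""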
        title = link.text
        # longshot from next element (?)
        if not title:
            title = link.next.string if link.next else None
            if title is None:
                logger.debug('longshot failed for {}', log_link)
                return None
        return title or None

    @staticmethod
    def _title_from_url(url):
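        """Derive a title from the url: the dn parameter for magnet links, otherwise the path basename."""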
        parts = parse.urlsplit(url)
        name = ''
        if parts.scheme == 'magnet':
            # also match a dn parameter that ends the query string
            match = re.search(r'(?:&dn(?:\.\d)?=)(.+?)(?:&|$)', parts.query)
            if match:
                name = match.group(1)
        else:
            name = Path(parts.path).name
        return parse.unquote_plus(name)

    def create_entries(self, page_url, soup, config):
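        """Create an entry for each <a> link on the page, resolving titles according to title_from."""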

        queue = []
        duplicates = {}
        duplicate_limit = 4

        def title_exists(title):
            """Helper method. Return True if title is already added to entries"""
            for entry in queue:
                if entry['title'] == title:
                    return True
            return False

        for link in soup.find_all('a'):
            # not a valid link
            if not link.has_attr('href'):
                continue
            # no content in the link
            if not link.contents and not config.get('allow_empty_links', False):
                continue

            url = link['href']
            # fix broken urls
            if url.startswith('//'):
                url = 'http:' + url
            elif not url.startswith(('http://', 'https://')):
                url = parse.urljoin(page_url, url)

            log_link = url.replace('\n', '').replace('\r', '')

            # get only links matching regexp
            regexps = config.get('links_re', None)
            if regexps:
                if not any(re.search(regexp, url) for regexp in regexps):
                    logger.debug('url does not match any "links_re": {}', url)
                    continue

            title_from = config.get('title_from', 'auto')
            if title_from == 'url':
                title = self._title_from_url(url)
                logger.debug('title from url: {}', title)
            elif title_from == 'title':
                if not link.has_attr('title'):
                    logger.warning("Link `{}` doesn't have title attribute, ignored.", log_link)
                    continue
                title = link['title']
                logger.debug('title from title: {}', title)
            elif title_from == 'auto':
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                # automatic mode, check if the title is unique
                # if there are too many duplicate titles, switch to title_from: url
                if title_exists(title):
                    # ignore index links as a counter
                    if 'index' in title and len(title) < 10:
                        logger.debug('ignored index title {}', title)
                        continue
                    duplicates.setdefault(title, 0)
                    duplicates[title] += 1
                    if duplicates[title] > duplicate_limit:
                        # if a url-derived title seems like a bad choice, use the title attribute instead
                        from_url = self._title_from_url(url)
                        switch_to = 'url'
                        for ext in ('.html', '.php'):
                            if from_url.endswith(ext):
                                switch_to = 'title'
                        logger.info(
                            "Link names seem to be useless, auto-configuring 'title_from: {}'. This may not work well, you might need to configure it yourself.",
                            switch_to,
                        )
                        config['title_from'] = switch_to
                        # start from the beginning ...
                        return self.create_entries(page_url, soup, config)
            elif title_from in ('link', 'contents'):
                # title from link contents
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                logger.debug('title from link: {}', title)
            else:
                raise plugin.PluginError('Unknown title_from value %s' % title_from)

            if not title:
                logger.warning('title could not be determined for link {}', log_link)
                continue

            # strip unicode whitespace
            title = title.replace('\u200B', '').strip()

            # if the title contains something like "xxxxxxx.torrent - foooo.torrent", keep only up to the first .torrent
            # TODO: hack
            if title.lower().find('.torrent') > 0:
                title = title[: title.lower().find('.torrent')]

            if title_exists(title):
                # titles should be unique, append a CRC32 of the url if this one is not
                crc32 = '%08X' % (zlib.crc32(url.encode('utf-8')) & 0xFFFFFFFF)
                title = '%s [%s]' % (title, crc32)
                # truly a duplicate, title + url crc already exists in queue
                if title_exists(title):
                    continue
                logger.debug('uniqued title to {}', title)

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            if 'username' in config and 'password' in config:
                entry['download_auth'] = (config['username'], config['password'])

            queue.append(entry)

        # return the collected entries
        return queue


@event('plugin.register')
def register_plugin():
    plugin.register(InputHtml, 'html', api_ver=2)