1# -*- coding: utf-8 -*-
2#
3# This file is part of urlwatch (https://thp.io/2008/urlwatch/).
4# Copyright (c) 2008-2021 Thomas Perl <m@thp.io>
5# All rights reserved.
6#
7# Redistribution and use in source and binary forms, with or without
8# modification, are permitted provided that the following conditions
9# are met:
10#
11# 1. Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13# 2. Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in the
15#    documentation and/or other materials provided with the distribution.
16# 3. The name of the author may not be used to endorse or promote products
17#    derived from this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30
31import re
32import logging
33import itertools
34import os
35import html.parser
36import hashlib
37import json
38import yaml
39import sys
40import subprocess
41import io
42import csv
43
44from enum import Enum
45from lxml import etree
46from lxml.cssselect import CSSSelector
47
48from xml.dom import minidom
49
50from .util import TrackSubClasses, import_module_from_source
51
52from .html2txt import html2text
53from .ical2txt import ical2text
54
55try:
56    from bs4 import BeautifulSoup
57except ImportError:
58    BeautifulSoup = None
59
60try:
61    import jsbeautifier
62except ImportError:
63    jsbeautifier = None
64
65try:
66    import cssbeautifier
67except ImportError:
68    cssbeautifier = None
69
70try:
71    import pdftotext
72except ImportError:
73    pdftotext = None
74
75try:
76    import pytesseract
77except ImportError:
78    pytesseract = None
79
80try:
81    from PIL import Image
82except ImportError:
83    Image = None
84
85try:
86    import jq
87except ImportError:
88    jq = None
89
90logger = logging.getLogger(__name__)
91
92
class FilterBase(object, metaclass=TrackSubClasses):
    # Base class of all filters.
    #
    # __subclasses__ maps a filter kind (its __kind__ string) to the filter
    # class; __anonymous_subclasses__ lists filter classes without a kind.
    # Both are presumably populated by the TrackSubClasses metaclass -- verify
    # against .util if in doubt.
    __subclasses__ = {}
    __anonymous_subclasses__ = []

    def __init__(self, job, state):
        self.job = job
        self.state = state

    @classmethod
    def filter_documentation(cls):
        """Return a human-readable listing of all filters and their subfilters.

        For each filter the kind and class docstring are listed, followed by
        its supported subfilters. The default subfilter (the one that can be
        given as an unnamed value) is wrapped in square brackets.
        """
        result = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            default_subfilter = getattr(sc, '__default_subfilter__', None)
            result.append('  * %s - %s' % (sc.__kind__, sc.__doc__))
            for key, doc in getattr(sc, '__supported_subfilters__', {}).items():
                is_default = (key == default_subfilter)
                result.append('      %s%s%s ... %s' % ('[' if is_default else '', key,
                                                       ']' if is_default else '', doc))
        result.append('\n[] ... Parameter can be supplied as unnamed value\n')
        return '\n'.join(result)

    @classmethod
    def auto_process(cls, state, data):
        """Apply every registered filter whose match() accepts the job.

        Named filters are tried first (sorted by kind), then the anonymous
        ones. Returns the data after all matching filters have been applied.
        """
        filters = itertools.chain((filtercls for _, filtercls in
                                   sorted(cls.__subclasses__.items(), key=lambda k_v: k_v[0])),
                                  cls.__anonymous_subclasses__)

        for filtercls in filters:
            filter_instance = filtercls(state.job, state)
            if filter_instance.match():
                logger.info('Auto-applying filter %r to %s', filter_instance, state.job.get_location())
                # filters require a subfilter argument
                data = filter_instance.filter(data, None)

        return data

    @classmethod
    def normalize_filter_list(cls, filter_spec):
        """Yield validated (filter_kind, subfilter) pairs for a filter spec.

        Raises ValueError for unknown filter kinds, for subfilters passed to
        filters that take none, and for unsupported subfilter keys.
        """
        for filter_kind, subfilter in cls._internal_normalize_filter_list(filter_spec):
            filtercls = cls.__subclasses__.get(filter_kind, None)

            if filtercls is None:
                raise ValueError('Unknown filter kind: {} (subfilter {})'.format(filter_kind, subfilter))

            if getattr(filtercls, '__no_subfilter__', False) and subfilter:
                raise ValueError('No subfilters supported for {}'.format(filter_kind))

            if isinstance(subfilter, dict) and hasattr(filtercls, '__supported_subfilters__'):
                provided_keys = set(subfilter.keys())
                allowed_keys = set(filtercls.__supported_subfilters__.keys())
                unknown_keys = provided_keys.difference(allowed_keys)
                # The special key '<any>' marks filters accepting arbitrary options
                if unknown_keys and '<any>' not in allowed_keys:
                    raise ValueError('Filter "{}" does not support subfilter(s): {} (supported: {})'.format(filter_kind,
                                                                                                            unknown_keys,
                                                                                                            allowed_keys))

            yield filter_kind, subfilter

    @classmethod
    def _internal_normalize_filter_list(cls, filter_spec):
        """Yield (filter_kind, subfilter) pairs without validating them.

        Accepts the deprecated comma-separated string form as well as a list
        whose items are either a filter kind (str) or a single-entry dict
        mapping the filter kind to its subfilter. Any other spec type yields
        nothing.

        Raises ValueError for list items that are neither a string nor a
        non-empty dict.
        """
        if isinstance(filter_spec, str):
            old_filter_spec = filter_spec

            # Legacy string-based filter list specification:
            # "filter1:param1,filter2,filter3,filter4:param4"
            filter_spec = [dict([filter_kind.split(':', 1)]) if ':' in filter_kind else filter_kind
                           for filter_kind in filter_spec.split(',')]

            logger.warning('String-based filter definitions (%s) are deprecated, please convert to dict-style (see https://urlwatch.readthedocs.io/en/latest/deprecated.html):\n\n%s',
                           old_filter_spec, yaml.dump(filter_spec, default_flow_style=False))

        if not isinstance(filter_spec, list):
            return

        for item in filter_spec:
            if isinstance(item, str):
                filter_kind, subfilter = item, None
            elif isinstance(item, dict) and item:
                # Only the first entry of the dict is used
                filter_kind, subfilter = next(iter(item.items()))
            else:
                # Without this branch the names would be unbound (or stale from
                # the previous item), and an empty dict would leak StopIteration
                # out of this generator as a RuntimeError.
                raise ValueError('Invalid filter specification: {!r}'.format(item))

            filtercls = cls.__subclasses__.get(filter_kind, None)

            if isinstance(subfilter, dict):
                yield filter_kind, subfilter
            elif subfilter is None:
                yield filter_kind, {}
            elif hasattr(filtercls, '__default_subfilter__'):
                # An unnamed value is assigned to the filter's default subfilter
                yield filter_kind, {getattr(filtercls, '__default_subfilter__'): subfilter}
            else:
                yield filter_kind, subfilter

    @classmethod
    def process(cls, filter_kind, subfilter, state, data):
        """Apply the filter registered as ``filter_kind`` to ``data``.

        Raises ValueError if no filter of that kind is registered.
        """
        logger.info('Applying filter %r, subfilter %r to %s', filter_kind, subfilter, state.job.get_location())
        filtercls = cls.__subclasses__.get(filter_kind, None)
        if filtercls is None:
            raise ValueError('Unknown filter kind: {} (subfilter {})'.format(filter_kind, subfilter))
        return filtercls(state.job, state).filter(data, subfilter)

    @classmethod
    def filter_chain_needs_bytes(cls, filter):
        """Return True if the first filter of the chain works on bytes."""
        # If the first filter is a bytes filter, return content in bytes instead of
        # in unicode as that's what's required by the library used by that filter
        first_filter = next(cls.normalize_filter_list(filter), None)
        if first_filter is not None:
            filter_kind, _ = first_filter
            return cls.is_bytes_filter_kind(filter_kind)

        return False

    @classmethod
    def is_bytes_filter_kind(cls, filter_kind):
        """Return True if ``filter_kind`` is registered and sets __uses_bytes__."""
        filtercls = cls.__subclasses__.get(filter_kind, None)
        return filtercls is not None and bool(getattr(filtercls, '__uses_bytes__', False))

    def match(self):
        """Return whether this filter should be auto-applied (default: never)."""
        return False

    def filter(self, data, subfilter):
        """Return the filtered ``data``; must be implemented by subclasses."""
        raise NotImplementedError()
211
212
class AutoMatchFilter(FilterBase):
    """Automatically matches subclass filters with a given location"""
    # Subclasses set MATCH to a dict of job keys and the values they must have.
    MATCH = None

    def match(self):
        """Return True if every MATCH entry equals the job's corresponding value."""
        expected = self.MATCH
        if expected is None:
            return False

        job_values = self.job.to_dict()
        result = True
        for key, wanted in expected.items():
            if not (job_values.get(key, None) == wanted):
                result = False
                break

        logger.debug('Matching %r with %r result: %r', self, self.job, result)
        return result
225
226
class RegexMatchFilter(FilterBase):
    """Same as AutoMatchFilter but matching is done with regexes"""
    # Subclasses set MATCH to a dict of job keys and pattern objects
    # (anything offering a .match() method).
    MATCH = None

    def match(self):
        """Return True if at least one MATCH key is present in the job and
        all present keys match their pattern."""
        patterns = self.MATCH
        if patterns is None:
            return False

        job_values = self.job.to_dict()

        # Keys missing from the job are skipped; at least one key must be
        # present, and every present key has to match.
        matches = []
        for key, pattern in patterns.items():
            if key in job_values:
                matches.append(pattern.match(job_values[key]))

        result = bool(matches) and all(matches)
        logger.debug('Matching %r with %r result: %r', self, self.job, result)
        return result
243
244
class LegacyHooksPyFilter(FilterBase):
    # Applies the filter() function of a legacy hooks.py file, if one exists
    # at FILENAME and can be imported.
    FILENAME = os.path.expanduser('~/.urlwatch/lib/hooks.py')

    def __init__(self, job, state):
        super().__init__(job, state)

        # Stays None when the hooks file is missing or fails to load
        self.hooks = None
        if not os.path.exists(self.FILENAME):
            return

        try:
            self.hooks = import_module_from_source('legacy_hooks', self.FILENAME)
        except Exception as e:
            logger.error('Could not load legacy hooks file: %s', e)

    def match(self):
        """Return True if the legacy hooks module was loaded."""
        return self.hooks is not None

    def filter(self, data, subfilter):
        """Run the legacy hook on ``data``.

        Returns the input unchanged if the hook returns None or raises.
        """
        if subfilter is not None:
            logger.warning('Legacy hooks filter does not have any subfilter -- ignored')

        try:
            hooked = self.hooks.filter(self.job.get_location(), data)
        except Exception as e:
            logger.warning('Could not apply legacy hooks filter: %s', e)
            return data

        return data if hooked is None else hooked
273
274
class BeautifyFilter(FilterBase):
    """Beautify HTML"""

    __kind__ = 'beautify'

    __no_subfilter__ = True

    def filter(self, data, subfilter):
        """Return ``data`` pretty-printed by BeautifulSoup.

        The contents of <script> and <style> tags are beautified as well when
        the optional jsbeautifier / cssbeautifier modules are available.
        Raises ImportError if BeautifulSoup is not installed.
        """
        if BeautifulSoup is None:
            raise ImportError('Please install BeautifulSoup')

        soup = BeautifulSoup(data, features="lxml")

        if jsbeautifier is None:
            logger.info('"jsbeautifier" is not installed, will not beautify <script> tags')
        else:
            for script_tag in soup.find_all('script'):
                # Tags without a single string child are left untouched
                if script_tag.string is not None:
                    script_tag.string = jsbeautifier.beautify(script_tag.string)

        if cssbeautifier is None:
            logger.info('"cssbeautifier" is not installed, will not beautify <style> tags')
        else:
            for style_tag in soup.find_all('style'):
                if style_tag.string is not None:
                    style_tag.string = cssbeautifier.beautify(style_tag.string)

        return soup.prettify()
307
308
class Html2TextFilter(FilterBase):
    """Convert HTML to plaintext"""

    __kind__ = 'html2text'

    __supported_subfilters__ = {
        'method': 'Method to use for conversion (default: re)',
        '<any>': 'Method-specific options passed to html2text',
    }

    __default_subfilter__ = 'method'

    def filter(self, data, subfilter):
        """Convert the HTML in ``data`` to plaintext via html2text().

        If ``subfilter`` names a ``method``, all remaining keys are passed on
        as method-specific options; otherwise the ``re`` method is used
        without options. The job's ``url`` (or ``navigate``) attribute is
        passed as base URL, falling back to an empty string.
        """
        if 'method' in subfilter:
            method = subfilter['method']
            # Build a new dict instead of deleting the key in place: the
            # subfilter dict belongs to the caller, and removing 'method' from
            # it would lose the configured method on any later use.
            options = {key: value for key, value in subfilter.items() if key != 'method'}
        else:
            method = 're'
            options = {}

        return html2text(data, baseurl=getattr(self.job, 'url', getattr(self.job, 'navigate', '')),
                         method=method, options=options)
332
333
class Csv2TextFilter(FilterBase):
    """Convert CSV to plaintext"""

    __kind__ = 'csv2text'

    __supported_subfilters__ = {
        'format_message': 'A format string with the headers that will be outputted for each csv '
                          'line (header will be lower-cased)',
        'ignore_header': 'If your format string is number based, but the CSV has headers, '
                         'this flag will force ignoring the header.',
        'has_header': 'If specified and true - use the first line as a header. '
                      'If false - force ignore first line as header (treat it as data). '
                      'If not specified csv.Sniffer will be used.',
    }

    __default_subfilter__ = 'format_message'

    def filter(self, data, subfilter):
        """Format every CSV row of ``data`` with the ``format_message`` string.

        With a header (and ``ignore_header`` not set) the lower-cased column
        names are available as named fields in the format string; otherwise
        the columns are passed positionally. Empty rows are skipped.

        Raises ValueError if no ``format_message`` is given.
        """
        if 'format_message' not in subfilter:
            raise ValueError('{} filter needs a format_message'.format(self.__kind__))

        message = subfilter['format_message']
        # Optional flag: defaults to honoring the header
        ignore_header = subfilter.get('ignore_header', False)

        has_header = subfilter.get('has_header')
        if has_header is None:
            has_header = csv.Sniffer().has_header(data)

        # Let the csv module split the lines itself (newline='') so quoted
        # fields containing line breaks stay intact, and drop the empty rows
        # produced by blank lines.
        rows = [row for row in csv.reader(io.StringIO(data, newline='')) if row]

        header = None
        if has_header and rows:
            header = [name.lower() for name in rows.pop(0)]

        lines = []
        for row in rows:
            if header and not ignore_header:
                legend = dict(zip(header, row))
                lines.append(message.format(**legend))
            else:
                lines.append(message.format(*row))

        return '\n'.join(lines)
379
380
class Pdf2TextFilter(FilterBase):
    """Convert PDF to plaintext"""
    # Requires data to be in bytes (not unicode)
    # Dependency: pdftotext (https://github.com/jalan/pdftotext), itself based
    # on poppler (https://poppler.freedesktop.org/)
    # Note: check pdftotext website for OS-specific dependencies for install

    __kind__ = 'pdf2text'
    __uses_bytes__ = True

    __supported_subfilters__ = {
        'password': 'PDF password for decryption',
    }

    def filter(self, data, subfilter):
        """Return the PDF's pages joined by blank lines.

        Raises ValueError if ``data`` is not bytes and ImportError if the
        pdftotext module is not installed.
        """
        # data must be bytes
        if not isinstance(data, bytes):
            raise ValueError('The pdf2text filter needs bytes input (is it the first filter?)')

        if pdftotext is None:
            raise ImportError('Please install pdftotext')

        password = subfilter.get('password', '')
        pages = pdftotext.PDF(io.BytesIO(data), password=password)
        return '\n\n'.join(pages)
404
405
class Ical2TextFilter(FilterBase):
    """Convert iCalendar to plaintext"""

    __kind__ = 'ical2text'

    __no_subfilter__ = True

    def filter(self, data, subfilter):
        """Return the result of ical2text() for ``data``; ``subfilter`` is unused."""
        converted = ical2text(data)
        return converted
415
416
class JsonFormatFilter(FilterBase):
    """Convert to formatted json"""

    __kind__ = 'format-json'

    __supported_subfilters__ = {
        'indentation': 'Indentation level for pretty-printing',
    }

    __default_subfilter__ = 'indentation'

    def filter(self, data, subfilter):
        """Re-serialize the JSON in ``data`` with sorted keys.

        The indentation width is taken from the ``indentation`` subfilter
        (default: 4).
        """
        indent_width = int(subfilter.get('indentation', 4))
        document = json.loads(data)
        return json.dumps(
            document,
            indent=indent_width,
            sort_keys=True,
            ensure_ascii=False,
            separators=(',', ': '),
        )
432
433
class PrettyXMLFilter(FilterBase):
    """Pretty-print XML"""

    __kind__ = 'pretty-xml'

    __supported_subfilters__ = {
        'indentation': 'Indentation level for pretty-printing',
    }

    __default_subfilter__ = 'indentation'

    def filter(self, data, subfilter):
        """Parse ``data`` with minidom and return it pretty-printed.

        Each nesting level is indented by ``indentation`` spaces (default: 2).
        """
        indent_width = int(subfilter.get('indentation', 2))
        document = minidom.parseString(data)
        return document.toprettyxml(indent=' ' * indent_width)
448
449
class GrepFilter(FilterBase):
    """Filter only lines matching a regular expression"""

    __kind__ = 'grep'

    __supported_subfilters__ = {
        're': 'Lines matching this expression are kept (required)',
    }

    __default_subfilter__ = 're'

    def filter(self, data, subfilter):
        """Return only the lines of ``data`` in which the ``re`` subfilter is found.

        Raises ValueError if no regular expression is given.
        """
        if 're' not in subfilter:
            raise ValueError('The grep filter needs a regular expression')

        expression = subfilter['re']
        kept = []
        for line in data.splitlines():
            if re.search(expression, line) is not None:
                kept.append(line)
        return '\n'.join(kept)
467
468
class InverseGrepFilter(FilterBase):
    """Remove lines matching a regular expression"""

    __kind__ = 'grepi'

    __supported_subfilters__ = {
        're': 'Lines matching this expression are removed (required)',
    }

    __default_subfilter__ = 're'

    def filter(self, data, subfilter):
        """Return the lines of ``data`` in which the ``re`` subfilter is not found.

        Raises ValueError if no regular expression is given.
        """
        if 're' not in subfilter:
            raise ValueError('The inverse grep filter needs a regular expression')

        expression = subfilter['re']
        remaining = []
        for line in data.splitlines():
            if re.search(expression, line) is None:
                remaining.append(line)
        return '\n'.join(remaining)
486
487
class StripFilter(FilterBase):
    """Strip leading and trailing whitespace"""

    __kind__ = 'strip'

    __no_subfilter__ = True

    def filter(self, data, subfilter):
        """Return ``data`` with surrounding whitespace removed; ``subfilter`` is unused."""
        stripped = data.strip()
        return stripped
497
498
class FilterBy(Enum):
    # Selection criterion understood by ElementsBy
    ATTRIBUTE = 1
    TAG = 2


class ElementsBy(html.parser.HTMLParser):
    """HTML parser that collects the markup of matching elements.

    Depending on ``filter_by``, elements are selected either by an attribute
    having exactly the given value (FilterBy.ATTRIBUTE, ``name`` is the
    attribute name) or by their tag name (any other value, ``name`` is the
    tag). Feed HTML with feed() and read the collected markup with get_html().
    """

    def __init__(self, filter_by, name, value=None):
        super().__init__()

        self._filter_by = filter_by
        if self._filter_by == FilterBy.ATTRIBUTE:
            self._attributes = {name: value}
        else:
            self._name = name

        self._result = []
        # True while the parser is within a matching element
        self._inside = False
        # Stack of tags opened since the match started
        self._elts = []

    def get_html(self):
        """Return the markup collected so far as a single string."""
        return ''.join(self._result)

    @staticmethod
    def _format_attribute(name, value):
        """Render one attribute; valueless attributes are emitted bare."""
        # HTMLParser reports attributes without a value (e.g. "disabled") as
        # (name, None); formatting that with %s would emit name="None".
        if value is None:
            return name
        return '%s="%s"' % (name, value)

    def handle_starttag(self, tag, attrs):
        ad = dict(attrs)

        if self._filter_by == FilterBy.ATTRIBUTE and all(ad.get(k, None) == v for k, v in self._attributes.items()):
            self._inside = True
        elif self._filter_by == FilterBy.TAG and tag == self._name:
            self._inside = True

        if self._inside:
            rendered_attrs = ' '.join(self._format_attribute(k, v) for k, v in attrs)
            self._result.append('<%s%s%s>' % (tag, ' ' if attrs else '', rendered_attrs))
            self._elts.append(tag)

    def handle_endtag(self, tag):
        if self._inside:
            self._result.append('</%s>' % (tag,))
            if tag in self._elts:
                # Unwind the stack down to the matching start tag, dropping
                # any tags that were opened but never closed in between
                t = self._elts.pop()
                while t != tag and self._elts:
                    t = self._elts.pop()
            if not self._elts:
                # The element that started the match has been closed
                self._inside = False

    def handle_data(self, data):
        if self._inside:
            self._result.append(data)
547
548
class GetElementById(FilterBase):
    """Get an HTML element by its ID"""

    __kind__ = 'element-by-id'

    __supported_subfilters__ = {
        'id': 'ID of the element to filter for (required)',
    }

    __default_subfilter__ = 'id'

    def filter(self, data, subfilter):
        """Return the markup of elements whose ``id`` attribute equals the
        ``id`` subfilter.

        Raises ValueError if no ID is given.
        """
        if 'id' not in subfilter:
            raise ValueError('Need an element ID for filtering')

        wanted_id = subfilter['id']
        parser = ElementsBy(FilterBy.ATTRIBUTE, 'id', wanted_id)
        parser.feed(data)
        return parser.get_html()
567
568
class GetElementByClass(FilterBase):
    """Get all HTML elements by class"""

    __kind__ = 'element-by-class'

    __supported_subfilters__ = {
        'class': 'HTML class attribute to filter for (required)',
    }

    __default_subfilter__ = 'class'

    def filter(self, data, subfilter):
        """Return the markup of elements whose ``class`` attribute equals the
        ``class`` subfilter.

        Raises ValueError if no class is given.
        """
        if 'class' not in subfilter:
            raise ValueError('Need an element class for filtering')

        wanted_class = subfilter['class']
        parser = ElementsBy(FilterBy.ATTRIBUTE, 'class', wanted_class)
        parser.feed(data)
        return parser.get_html()
587
588
class GetElementByStyle(FilterBase):
    """Get all HTML elements by style"""

    __kind__ = 'element-by-style'

    __supported_subfilters__ = {
        'style': 'HTML style attribute value to filter for (required)',
    }

    __default_subfilter__ = 'style'

    def filter(self, data, subfilter):
        """Return the markup of elements whose ``style`` attribute equals the
        ``style`` subfilter.

        Raises ValueError if no style is given.
        """
        if 'style' not in subfilter:
            raise ValueError('Need an element style for filtering')

        wanted_style = subfilter['style']
        parser = ElementsBy(FilterBy.ATTRIBUTE, 'style', wanted_style)
        parser.feed(data)
        return parser.get_html()
607
608
class GetElementByTag(FilterBase):
    """Get an HTML element by its tag"""

    __kind__ = 'element-by-tag'

    __supported_subfilters__ = {
        'tag': 'HTML tag name to filter for (required)',
    }

    __default_subfilter__ = 'tag'

    def filter(self, data, subfilter):
        """Return the markup of elements whose tag name equals the ``tag``
        subfilter.

        Raises ValueError if no tag is given.
        """
        if 'tag' not in subfilter:
            raise ValueError('Need a tag for filtering')

        wanted_tag = subfilter['tag']
        parser = ElementsBy(FilterBy.TAG, wanted_tag)
        parser.feed(data)
        return parser.get_html()
627
628
class Sha1Filter(FilterBase):
    """Calculate the SHA-1 checksum of the content"""

    __kind__ = 'sha1sum'

    __no_subfilter__ = True

    def filter(self, data, subfilter):
        """Return the hex SHA-1 digest of ``data`` encoded as UTF-8.

        Characters that cannot be encoded are ignored.
        """
        encoded = data.encode('utf-8', 'ignore')
        return hashlib.sha1(encoded).hexdigest()
640
641
class HexdumpFilter(FilterBase):
    """Convert binary data to hex dump format"""

    __kind__ = 'hexdump'

    __no_subfilter__ = True

    def filter(self, data, subfilter):
        """Return a hex dump of ``data`` (encoded as UTF-8), 16 bytes per line.

        Each line holds the space-separated hex values, two spaces, and the
        bytes as text with everything outside 32..126 shown as '.'.
        """
        raw = bytearray(data.encode('utf-8', 'ignore'))

        dump_lines = []
        for offset in range(0, len(raw), 16):
            block = raw[offset:offset + 16]
            hex_part = ' '.join('%02x' % byte for byte in block)
            text_part = ''.join(chr(byte) if 31 < byte < 127 else '.' for byte in block)
            dump_lines.append('%s  %s' % (hex_part, text_part))

        return '\n'.join(dump_lines)
655
656
class LxmlParser:
    """Select (and optionally exclude) parts of an HTML/XML document with lxml.

    The document is accumulated via feed(); get_filtered_data() evaluates the
    CSS selector or XPath expression and returns the serialized matches.
    """

    # Human-readable names of the expression types, used in error messages
    EXPR_NAMES = {'css': 'a CSS selector',
                  'xpath': 'an XPath expression'}

    def __init__(self, filter_kind, subfilter, expr_key):
        """Read the parser configuration from ``subfilter``.

        ``filter_kind`` is 'css' or 'xpath'; ``expr_key`` is the subfilter key
        holding the expression. Optional subfilter keys: 'method' (default
        'html'), 'exclude', 'namespaces', 'skip' and 'maxitems' (both
        default 0).

        Raises ValueError if the expression is missing, if the method is
        neither 'html' nor 'xml', or if namespaces are combined with the
        'html' method.
        """
        self.filter_kind = filter_kind
        if expr_key not in subfilter:
            raise ValueError('Need %s for filtering' % (self.EXPR_NAMES[filter_kind],))
        self.expression = subfilter[expr_key]
        self.method = subfilter.get('method', 'html')
        self.exclude = subfilter.get('exclude')
        self.namespaces = subfilter.get('namespaces')
        self.skip = int(subfilter.get('skip', 0))
        self.maxitems = int(subfilter.get('maxitems', 0))
        if self.method not in ('html', 'xml'):
            raise ValueError('%s method must be "html" or "xml", got %r' % (filter_kind, self.method))
        if self.method == 'html' and self.namespaces is not None:
            raise ValueError('Namespace prefixes only supported with "xml" method.')
        self.parser = (etree.HTMLParser if self.method == 'html' else etree.XMLParser)()
        self.data = ''

    def feed(self, data):
        """Append ``data`` to the document that will be parsed."""
        self.data += data

    def _to_string(self, element):
        """Serialize a selected item: strings as-is, elements via etree.tostring()."""
        # Handle "/text()" selector, which returns lxml.etree._ElementUnicodeResult (Issue #282)
        if isinstance(element, str):
            return element

        return etree.tostring(element, pretty_print=True, method=self.method, encoding='unicode', with_tail=False)

    @staticmethod
    def _remove_element(element):
        """Remove an excluded item (element, text, tail or attribute) from its parent."""
        parent = element.getparent()
        if parent is None:
            # Do not exclude root element
            return
        if isinstance(element, etree._ElementUnicodeResult):
            # String results are removed by clearing the parent's tail, text
            # or attribute they were read from
            if element.is_tail:
                parent.tail = None
            elif element.is_text:
                parent.text = None
            elif element.is_attribute:
                del parent.attrib[element.attrname]
        else:
            # Keep the text following the removed element by appending it to
            # the previous sibling's tail, or to the parent's text if there
            # is no previous sibling
            previous = element.getprevious()
            if element.tail is not None:
                if previous is not None:
                    previous.tail = previous.tail + element.tail if previous.tail else element.tail
                else:
                    parent.text = parent.text + element.tail if parent.text else element.tail
            parent.remove(element)

    def _reevaluate(self, element):
        """Return the current value of a selected item, or None if it is orphaned.

        For string results the value is re-read from the parent's tail, text
        or attribute; elements are returned unchanged.
        """
        if self._orphaned(element):
            return None
        if isinstance(element, etree._ElementUnicodeResult):
            parent = element.getparent()
            if parent is None:
                return element
            if element.is_tail:
                return parent.tail
            elif element.is_text:
                return parent.text
            elif element.is_attribute:
                return parent.attrib.get(element.attrname)
        else:
            return element

    def _orphaned(self, element):
        """Return True if the item is no longer found in its tree.

        A string result counts as orphaned when the tail, text or attribute
        it came from is now None; otherwise its parent element is checked.
        An element is orphaned when looking up its own path in its root tree
        does not yield the element itself.
        """
        if isinstance(element, etree._ElementUnicodeResult):
            parent = element.getparent()
            if ((element.is_tail and parent.tail is None)
                    or (element.is_text and parent.text is None)
                    or (element.is_attribute and parent.attrib.get(element.attrname) is None)):
                return True
            else:
                element = parent
        try:
            tree = element.getroottree()
            path = tree.getpath(element)
            return element is not tree.xpath(path, namespaces=self.namespaces)[0]
        except (ValueError, IndexError):
            return True

    def _get_filtered_elements(self):
        """Parse the fed data and return the selected items minus the excluded ones."""
        try:
            root = etree.fromstring(self.data, self.parser)
        except ValueError:
            # Strip XML declaration, for example: '<?xml version="1.0" encoding="utf-8"?>'
            # for https://heronebag.com/blog/index.xml, an error happens, as we get a
            # a (Unicode) string, but the XML contains its own "encoding" declaration
            self.data = re.sub(r'^<[?]xml[^>]*[?]>', '', self.data)
            # Retry parsing with XML declaration removed (Fixes #281)
            root = etree.fromstring(self.data, self.parser)
        if root is None:
            return []
        excluded_elems = None
        if self.filter_kind == 'css':
            selected_elems = CSSSelector(self.expression,
                                         namespaces=self.namespaces).evaluate(root)
            excluded_elems = CSSSelector(self.exclude,
                                         namespaces=self.namespaces).evaluate(root) if self.exclude else None
        elif self.filter_kind == 'xpath':
            selected_elems = root.xpath(self.expression, namespaces=self.namespaces)
            excluded_elems = root.xpath(self.exclude, namespaces=self.namespaces) if self.exclude else None
        # Selection happens before the exclusions are removed from the tree,
        # so the selected items are re-evaluated afterwards and the ones that
        # got orphaned are dropped
        if excluded_elems is not None:
            for el in excluded_elems:
                self._remove_element(el)
        return [el for el in map(self._reevaluate, selected_elems) if el is not None]

    def get_filtered_data(self):
        """Return the serialized matches joined by newlines.

        The first ``skip`` matches are dropped, then at most ``maxitems``
        matches are kept; a value of 0 disables the respective limit.
        """
        elements = list(self._get_filtered_elements())
        if self.skip:
            elements = elements[self.skip:]
        if self.maxitems:
            elements = elements[:self.maxitems]
        return '\n'.join(self._to_string(element) for element in elements)
775
776
# Subfilter keys (with their documentation strings) shared by the filters
# built on LxmlParser; merged into the __supported_subfilters__ of the
# 'css' and 'xpath' filters.
LXML_PARSER_COMMON_SUBFILTERS = {
    'method': 'The method (html or xml) used for parsing',
    'exclude': 'Elements to remove from the final result',
    'namespaces': 'Mapping of XML namespaces for matching',
    'skip': 'Number of elements to skip from the beginning (default: 0)',
    'maxitems': 'Maximum number of items to return (default: all)',
}
784
785
class CssFilter(FilterBase):
    """Filter XML/HTML using CSS selectors"""

    __kind__ = 'css'

    __supported_subfilters__ = {
        'selector': 'The CSS selector to use for filtering (required)',
        **LXML_PARSER_COMMON_SUBFILTERS,
    }

    __default_subfilter__ = 'selector'

    def filter(self, data, subfilter):
        """Return the parts of ``data`` selected by the ``selector`` subfilter."""
        parser = LxmlParser('css', subfilter, 'selector')
        parser.feed(data)
        selected = parser.get_filtered_data()
        return selected
802
803
class XPathFilter(FilterBase):
    """Filter XML/HTML using XPath expressions"""

    __kind__ = 'xpath'

    __supported_subfilters__ = {
        'path': 'The XPath to use for filtering (required)',
        **LXML_PARSER_COMMON_SUBFILTERS,
    }

    __default_subfilter__ = 'path'

    def filter(self, data, subfilter):
        """Return the parts of ``data`` selected by the ``path`` subfilter."""
        parser = LxmlParser('xpath', subfilter, 'path')
        parser.feed(data)
        selected = parser.get_filtered_data()
        return selected
820
821
class RegexSub(FilterBase):
    """Replace text with regular expressions using Python's re.sub"""

    __kind__ = 're.sub'

    __supported_subfilters__ = {
        'pattern': 'Regular expression to search for (required)',
        'repl': 'Replacement string (default: empty string)',
    }

    __default_subfilter__ = 'pattern'

    def filter(self, data, subfilter):
        """Return ``data`` with every match of ``pattern`` replaced by ``repl``.

        Raises ValueError if no pattern is given.
        """
        if 'pattern' not in subfilter:
            raise ValueError('{} needs a pattern'.format(self.__kind__))

        pattern = subfilter['pattern']
        # Matches are removed when no replacement is configured
        replacement = subfilter.get('repl', '')
        return re.sub(pattern, replacement, data)
840
841
class SortFilter(FilterBase):
    """Sort input items"""

    __kind__ = 'sort'

    __supported_subfilters__ = {
        'reverse': 'Set to true to reverse sorting order',
        'separator': 'Item separator (default: newline)',
    }

    __default_subfilter__ = 'separator'

    def filter(self, data, subfilter):
        """Return the items of ``data`` sorted case-insensitively.

        Items are split and re-joined with ``separator``; the order is
        reversed only if ``reverse`` is exactly True.
        """
        reverse = (isinstance(subfilter, dict) and subfilter.get('reverse', False) is True)
        separator = subfilter.get('separator', '\n')

        items = data.split(separator)
        items.sort(key=str.casefold, reverse=reverse)
        return separator.join(items)
858
859
class RemoveDuplicateLinesFilter(FilterBase):
    """Remove duplicate lines"""

    __kind__ = 'remove-duplicate-lines'

    __supported_subfilters__ = {
        'separator': 'Item separator (default: newline)',
    }

    __default_subfilter__ = 'separator'

    def filter(self, data, subfilter):
        """Return ``data`` with repeated items removed.

        Items are split and re-joined with ``separator``; the first
        occurrence of each item is kept, in its original position.
        """
        separator = subfilter.get('separator', '\n')

        already_seen = set()
        unique_items = []
        for item in data.split(separator):
            if item in already_seen:
                continue
            already_seen.add(item)
            unique_items.append(item)

        return separator.join(unique_items)
883
884
class ReverseFilter(FilterBase):
    """Reverse input items"""

    __kind__ = 'reverse'

    __supported_subfilters__ = {
        'separator': 'Item separator (default: newline)',
    }

    __default_subfilter__ = 'separator'

    def filter(self, data, subfilter):
        """Return the items of ``data`` in reverse order.

        Items are split and re-joined with ``separator``.
        """
        separator = subfilter.get('separator', '\n')
        items = data.split(separator)
        items.reverse()
        return separator.join(items)
899
900
class ShellPipeFilter(FilterBase):
    """Filter using a shell command"""

    __kind__ = 'shellpipe'

    __supported_subfilters__ = {
        'command': 'Shell command to execute for filtering (required)',
    }

    __default_subfilter__ = 'command'

    def filter(self, data, subfilter):
        """Pipe ``data`` through the shell command and return its output.

        The command receives the data on stdin and sees the environment
        variables URLWATCH_JOB_NAME and URLWATCH_JOB_LOCATION. Input and
        output are converted with sys.getdefaultencoding().

        Raises ValueError if no command is given.
        """
        if 'command' not in subfilter:
            raise ValueError('{} filter needs a command'.format(self.__kind__))

        encoding = sys.getdefaultencoding()
        job = self.job

        # Work on a copy to not modify the outside environment
        env = dict(os.environ)
        env['URLWATCH_JOB_NAME'] = job.pretty_name() if job else ''
        env['URLWATCH_JOB_LOCATION'] = job.get_location() if job else ''

        raw_output = subprocess.check_output(subfilter['command'], shell=True,
                                             input=data.encode(encoding), env=env)
        return raw_output.decode(encoding)
927
928
class OCRFilter(FilterBase):
    """Convert text in images to plaintext using Tesseract OCR"""

    __kind__ = 'ocr'
    __uses_bytes__ = True

    __supported_subfilters__ = {
        'language': 'Language of the text (e.g. "fra" or "eng+fra")',
        'timeout': 'Timeout (in seconds) for OCR (default 10 seconds)',
    }

    def filter(self, data, subfilter):
        """Return the text pytesseract recognizes in the image ``data``.

        Raises ValueError if ``data`` is not bytes and ImportError if
        pytesseract or PIL is not installed.
        """
        if not isinstance(data, bytes):
            raise ValueError('The ocr filter needs bytes input (is it the first filter?)')

        language = subfilter.get('language', None)
        timeout = int(subfilter.get('timeout', 10))

        # Both optional dependencies are needed
        if pytesseract is None:
            raise ImportError('Please install pytesseract')
        if Image is None:
            raise ImportError('Please install Pillow/PIL')

        image = Image.open(io.BytesIO(data))
        return pytesseract.image_to_string(image, lang=language, timeout=timeout)
954
955
class JQFilter(FilterBase):
    """Parse, transform, and extract data from json as text using `jq`"""

    __kind__ = 'jq'

    __supported_subfilters__ = {
        'query': 'jq query function to execute on data',
    }

    __default_subfilter__ = 'query'

    def filter(self, data, subfilter):
        """Parse ``data`` as JSON and return the text result of the jq ``query``.

        Raises ValueError if ``data`` is not valid JSON or no query is given,
        and ImportError if the jq module is not installed.
        """
        # The input is validated first, before the query and the optional
        # jq dependency are checked
        try:
            document = json.loads(data)
        except ValueError:
            raise ValueError('The url response contained invalid JSON')

        if 'query' not in subfilter:
            raise ValueError('{} filter needs a query'.format(self.__kind__))
        if jq is None:
            raise ImportError('Please install jq')

        query = subfilter['query']
        return jq.text(query, document)
981