1# -*- coding: UTF-8 -*-
2#   Copyright 2009-2018 Oli Schacher
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16#
17#
18import logging
19import os
20import sys
21import time
22import socket
23import uuid
24import threading
25from fuglu.localStringEncoding import force_uString, force_bString
26try:
27    from html.parser import HTMLParser
28except ImportError:
29    from HTMLParser import HTMLParser
30
31HAVE_BEAUTIFULSOUP = False
32BS_VERSION = 0
33try:
34    import bs4 as BeautifulSoup
35    HAVE_BEAUTIFULSOUP = True
36    BS_VERSION = 4
37except ImportError:
38    pass
39
40if not HAVE_BEAUTIFULSOUP:
41    try:
42        import BeautifulSoup
43        HAVE_BEAUTIFULSOUP = True
44        BS_VERSION = 3
45    except ImportError:
46        pass
47
48import email
49import re
50try:
51    import configparser
52except ImportError:
53    import ConfigParser as configparser
54import datetime
55from string import Template
56from email.header import Header
57
# constants: numeric action codes and their symbolic names

# DUNNO  = go on
# ACCEPT = accept message, no further tests
# DELETE = blackhole, no further tests
# REJECT = reject, no further tests
# DEFER  = defer, no further tests
DUNNO, ACCEPT, DELETE, REJECT, DEFER = 0, 1, 2, 3, 4

# lookup table: symbolic action name -> numeric action code
ALLCODES = dict(
    DUNNO=DUNNO,
    ACCEPT=ACCEPT,
    DELETE=DELETE,
    REJECT=REJECT,
    DEFER=DEFER,
)
73
74
def actioncode_to_string(actioncode):
    """Translate a numeric action code into its symbolic name.

    Returns the first key of ALLCODES whose value equals actioncode.
    If there is none, "NULL ACTION CODE" is returned for None and
    'INVALID ACTION CODE <value>' for everything else.
    """
    # ALLCODES keys are strings, so None is a safe "nothing found" marker
    name = next(
        (key for key, val in ALLCODES.items() if val == actioncode), None)
    if name is not None:
        return name
    if actioncode is None:
        return "NULL ACTION CODE"
    return 'INVALID ACTION CODE %s' % actioncode
83
84
def string_to_actioncode(actionstring, config=None):
    """Translate an action name into its numeric action code.

    The name is matched case insensitively with surrounding whitespace removed,
    DISCARD is accepted as alias for DELETE.
    If a config object is passed, the placeholders DEFAULTHIGHSPAMACTION,
    DEFAULTLOWSPAMACTION and DEFAULTVIRUSACTION are resolved to the action
    configured in [spam] defaulthighspamaction, [spam] defaultlowspamaction
    and [virus] defaultvirusaction.

    Returns None if the (resolved) name is not a key of ALLCODES.
    """
    wanted = actionstring.upper().strip()

    # support DISCARD as alias for DELETE
    if wanted == 'DISCARD':
        wanted = 'DELETE'

    if config is not None:
        # placeholder name -> (config section, config option) holding the real action
        placeholders = {
            'DEFAULTHIGHSPAMACTION': ('spam', 'defaulthighspamaction'),
            'DEFAULTLOWSPAMACTION': ('spam', 'defaultlowspamaction'),
            'DEFAULTVIRUSACTION': ('virus', 'defaultvirusaction'),
        }
        if wanted in placeholders:
            section, option = placeholders[wanted]
            # the configured value is only upper-cased, it is neither stripped
            # nor run through the DISCARD alias
            wanted = config.get(section, option).upper()

    return ALLCODES.get(wanted)
115
116
def apply_template(templatecontent, suspect, values=None, valuesfunction=None):
    """Fill in the template variables described at
    http://fumail.github.io/fuglu/plugins-index.html#template-variables
    with the actual values taken from suspect and return the resulting text.

    values: optional dict with additional variables. The built-in variables are
    written into this very dict (see default_template_values), so it is modified.

    valuesfunction: optional callable. It receives the dict with all built-in and
    passed values and must return the dict which is finally used for the
    substitution, this allows further modifications like SQL escaping etc.

    Variables without a value are left in the text unchanged (safe_substitute).
    """
    substitutions = {} if values is None else values
    default_template_values(suspect, substitutions)

    if valuesfunction is not None:
        substitutions = valuesfunction(substitutions)

    return Template(force_uString(templatecontent)).safe_substitute(substitutions)
137
138
def default_template_values(suspect, values=None):
    """Build the default template variables applicable for this suspect.

    If a values dict is passed it is filled in place (and returned),
    otherwise a new dict is created and returned."""

    if values is None:
        values = {}

    # variables which are plain copies of the suspect attribute with the same name
    for attribute in ('id', 'timestamp', 'from_address', 'to_address',
                      'from_domain', 'to_domain'):
        values[attribute] = getattr(suspect, attribute)

    values['subject'] = suspect.get_message_rep()['subject']

    # current date and (locale formatted) time at the moment of the call
    values['date'] = str(datetime.date.today())
    values['time'] = time.strftime('%X')
    return values
156
# host name as reported by socket.gethostname(), looked up once when this module is imported
HOSTNAME = socket.gethostname()
158
159
def yesno(val):
    """Map a truthy value to the string 'yes' and a falsy value to the string 'no'"""
    return 'yes' if val else 'no'
166
167
168class Suspect(object):
169
170    """
171    The suspect represents the message to be scanned. Each scannerplugin will be presented
172    with a suspect and may modify the tags or even the message content itself.
173    """
174
175    def __init__(self, from_address, recipients, tempfile):
176        self.source = None
177        """holds the message source if set directly"""
178
179        self._msgrep = None
180        """holds a copy of the message representation"""
181
182        # tags set by plugins
183        self.tags = {}
184        self.tags['virus'] = {}
185        self.tags['blocked'] = {}
186        self.tags['spam'] = {}
187        self.tags['highspam'] = {}
188        self.tags['decisions'] = []
189        self.tags['scantimes'] = []
190
191        # temporary file containing the message source
192        self.tempfile = tempfile
193
194        # stuff set from smtp transaction
195        self.size = os.path.getsize(tempfile)
196        self.from_address = from_address
197
198        # backwards compatibility, recipients can be a single address
199        if isinstance(recipients, list):
200            self.recipients = recipients
201        else:
202            self.recipients = [recipients, ]
203
204        # basic email validitiy check - nothing more than necessary for our internal assumptions
205        for rec in self.recipients:
206            if rec is None:
207                raise ValueError("Recipient address can not be None")
208            if not re.match(r"[^@]+@[^@]+$", rec):
209                raise ValueError("Invalid recipient address: %s"%rec)
210
211
212        # additional basic information
213        self.timestamp = time.time()
214        self.id = self._generate_id()
215
216        # headers which are prepended before re-injecting the message
217        self.addheaders = {}
218
219        if self.from_address is None:
220            self.from_address = ''
221
222        if self.from_address!='' and  not re.match(r"[^@]+@[^@]+$", self.from_address):
223            raise ValueError("invalid sender address: %s"%self.from_address)
224
225        self.clientinfo = None
226        """holds client info tuple: helo, ip, reversedns"""
227
228    @property
229    def to_address(self):
230        """Returns the first recipient address"""
231        try:
232            return self.recipients[0]
233        except IndexError:
234            return None
235
236    @to_address.setter
237    def to_address(self, recipient):
238        """Sets a single recipient for this suspect, removing all others"""
239        self.recipients=[recipient,]
240
241    @property
242    def to_localpart(self):
243        """Returns the local part of the first recipient"""
244        try:
245            return self.to_address.rsplit('@', 1)[0]
246        except Exception:
247            logging.getLogger('suspect').error('could not extract localpart from recipient address %s' % self.to_address)
248            return None
249
250    @property
251    def to_domain(self):
252        """Returns the local part of the first recipient"""
253        try:
254            return self.to_address.rsplit('@', 1)[1]
255        except Exception:
256            logging.getLogger('suspect').error('could not extract domain from recipient address %s' % self.from_address)
257            return None
258
259
260    @property
261    def from_localpart(self):
262        if self.from_address == '':
263            return ''
264
265        else:
266            try:
267               return self.from_address.rsplit('@', 1)[0]
268            except Exception:
269                logging.getLogger('suspect').error('could not extract localpart from sender address %s'%self.from_address)
270                return None
271
272    @property
273    def from_domain(self):
274        if self.from_address == '':
275            return ''
276
277        else:
278            try:
279                return self.from_address.rsplit('@', 1)[1]
280            except Exception:
281                logging.getLogger('suspect').error('could not extract domain from sender address %s' % self.from_address)
282                return None
283
284
285    def _generate_id(self):
286        """
287        returns a unique id (a string of 32 hex characters)
288        """
289        return uuid.uuid4().hex
290
291    def debug(self, message):
292        """Add a line to the debug log if debugging is enabled for this message"""
293        if not self.get_tag('debug'):
294            return
295        isotime = datetime.datetime.now().isoformat()
296        fp = self.get_tag('debugfile')
297        try:
298            fp.write('%s %s\n' % (isotime, message))
299            fp.flush()
300        except Exception as e:
301            logging.getLogger('suspect').error(
302                'Could not write to logfile: %s' % e)
303
304    def get_tag(self, key, defaultvalue=None):
305        """returns the tag value. if the tag is not found, return defaultvalue instead (None if no defaultvalue passed)"""
306        if key not in self.tags:
307            return defaultvalue
308        return self.tags[key]
309
310    def set_tag(self, key, value):
311        """Set a new tag"""
312        self.tags[key] = value
313
314    def is_highspam(self):
315        """Returns True if ANY of the spam engines tagged this suspect as high spam"""
316        for key in list(self.tags['highspam'].keys()):
317            val = self.tags['highspam'][key]
318            if val:
319                return True
320        return False
321
322    def is_spam(self):
323        """Returns True if ANY of the spam engines tagged this suspect as spam"""
324        for key in list(self.tags['spam'].keys()):
325            val = self.tags['spam'][key]
326            if val:
327                return True
328        return False
329
330    def is_blocked(self):
331        """Returns True if ANY plugin tagged this suspect as blocked"""
332        for key in list(self.tags['blocked'].keys()):
333            val = self.tags['blocked'][key]
334            if val:
335                return True
336        return False
337
338    def is_virus(self):
339        """Returns True if ANY of the antivirus engines tagged this suspect as infected"""
340        for key in list(self.tags['virus'].keys()):
341            val = self.tags['virus'][key]
342            if val:
343                return True
344        return False
345
346    def is_ham(self):
347        """Returns True if message is neither considered to be spam, virus or blocked"""
348        if self.is_spam() or self.is_virus() or self.is_blocked() or self.is_highspam():
349            return False
350        return True
351
352    def update_subject(self, subject_cb, **cb_params):
353        """
354        update/alter the message subject
355        :param subject_cb: callback function that alters the subject. must accept a string and return a string
356        :param cb_params: additional parameters to be passed to subject_cb
357        :return: True if subject was altered, False otherwise
358        """
359        msgrep = self.get_message_rep()
360        oldsubj = msgrep.get("subject","")
361        newsubj = subject_cb(oldsubj, **cb_params)
362        if oldsubj != newsubj:
363            del msgrep["subject"]
364            msgrep["subject"] = newsubj
365            self.set_message_rep(msgrep)
366            if self.get_tag('origsubj') is None:
367                self.set_tag('origsubj', oldsubj)
368            return True
369        return False
370
371
372    def add_header(self, key, value, immediate=False):
373        """adds a header to the message. by default, headers will added when re-injecting the message back to postfix
374        if you set immediate=True the message source will be replaced immediately. Only set this to true if a header must be
375        visible to later plugins (eg. for spamassassin rules), otherwise, leave as False which is faster.
376        """
377        if immediate:
378            # is ignore the right thing to do here?
379            value = value.encode('UTF-8', 'ignore')
380            hdr = Header(value, header_name=key, continuation_ws=' ')
381            hdrline = "%s: %s\n" % (key, hdr.encode())
382            src = force_bString(hdrline) + force_bString(self.get_source())
383            self.set_source(src)
384        else:
385            self.addheaders[key] = value
386
387    def addheader(self, key, value, immediate=False):
388        """old name for add_header"""
389        return self.add_header(key, value, immediate)
390
391    def get_current_decision_code(self):
392        dectag = self.get_tag('decisions')
393        if dectag is None:
394            return DUNNO
395        try:
396            pluginname, code = dectag[-1]
397            return code
398        except Exception:
399            return DUNNO
400
401    def _short_tag_rep(self):
402        """return a tag representation suitable for logging, with some tags stripped, some shortened"""
403        blacklist = ['decisions', 'scantimes', 'debugfile']
404        tagscopy = {}
405
406        for k, v in self.tags.items():
407            if k in blacklist:
408                continue
409
410            try:
411                strrep = str(v)
412            except Exception:  # Unicodedecode errors and stuff like that
413                continue
414
415            therep = v
416
417            maxtaglen = 100
418            if len(strrep) > maxtaglen:
419                therep = strrep[:maxtaglen] + "..."
420
421            # specialfixes
422            if k == 'SAPlugin.spamscore' and not isinstance(v, str):
423                therep = "%.2f" % v
424
425            tagscopy[k] = therep
426        return str(tagscopy)
427
428    def log_format(self, template=None):
429        addvals = {
430            'size': self.size,
431            'spam': yesno(self.is_spam()),
432            'highspam': yesno(self.is_highspam()),
433            'blocked': yesno(self.is_blocked()),
434            'virus': yesno(self.is_virus()),
435            'modified': yesno(self.is_modified()),
436            'decision': actioncode_to_string(self.get_current_decision_code()),
437            'tags': self._short_tag_rep(),
438            'fulltags': str(self.tags),
439        }
440        return apply_template(template, self, addvals)
441
442    def __str__(self):
443        """representation good for logging"""
444        return self.log_format("Suspect ${id}: from=${from_address} to=${to_address} size=${size} spam=${spam} blocked=${blocked} virus=${virus} modified=${modified} decision=${decision} tags=${tags}")
445
446    def get_message_rep(self):
447        """returns the python email api representation of this suspect"""
448        # do we have a cached instance already?
449        if self._msgrep is not None:
450            return self._msgrep
451
452        if self.source is not None:
453            if sys.version_info > (3,):
454                # Python 3 and larger
455                # the basic "str" type is unicode
456                if isinstance(self.source, str):
457                    msgrep = email.message_from_string(self.source)
458                else:
459                    msgrep = email.message_from_bytes(self.source)
460            else:
461                # Python 2.x
462                msgrep = email.message_from_string(self.source)
463
464            self._msgrep = msgrep
465            return msgrep
466        else:
467            if sys.version_info > (3,):
468                # Python 3 and larger
469                # file should be binary...
470
471                # IMPORTANT: It is possible to use email.message_from_bytes BUT this will automatically replace
472                #            '\r\n' in the message (_payload) by '\n' and the endtoend_test.py will fail!
473                tmpSource = self.get_original_source()
474                msgrep = email.message_from_bytes(tmpSource)
475            else:
476                # Python 2.x
477                with open(self.tempfile, 'r') as fh:
478                    msgrep = email.message_from_file(fh)
479            self._msgrep = msgrep
480            return msgrep
481
482    def getMessageRep(self):
483        """old name for get_message_rep"""
484        return self.get_message_rep()
485
486    def set_message_rep(self, msgrep):
487        """replace the message content. this must be a standard python email representation
488        Warning: setting the source via python email representation seems to break dkim signatures!
489        """
490        if sys.version_info > (3,):
491            # Python 3 and larger
492            # stick to bytes...
493            try:
494                self.set_source(msgrep.as_bytes())
495            except AttributeError:
496                self.set_source(force_bString(msgrep.as_string()))
497        else:
498            # Python 2.x
499            self.set_source(msgrep.as_string())
500
501        # order is important, set_source sets source to None
502        self._msgrep = msgrep
503
504    def setMessageRep(self, msgrep):
505        """old name for set_message_rep"""
506        return self.set_message_rep(msgrep)
507
508    def is_modified(self):
509        """returns true if the message source has been modified"""
510        return self.source is not None
511
512    def get_source(self, maxbytes=None):
513        """returns the current message source, possibly changed by plugins"""
514        if self.source is not None:
515            return self.source[:maxbytes]
516        else:
517            return self.get_original_source(maxbytes)
518
519    def getSource(self, maxbytes=None):
520        """old name for get_source"""
521        return self.get_source(maxbytes)
522
523    def set_source(self, source):
524        self.source = source
525        self._msgrep = None
526
527    def setSource(self, source):
528        """old name for set_source"""
529        return self.set_source(source)
530
531    def get_original_source(self, maxbytes=None):
532        """returns the original, unmodified message source"""
533        readbytes = -1
534        if maxbytes is not None:
535            readbytes = maxbytes
536        try:
537            with open(self.tempfile, 'rb') as fh:
538                source = fh.read(readbytes)
539        except Exception as e:
540            logging.getLogger('fuglu.suspect').error(
541                'Cannot retrieve original source from tempfile %s : %s' % (self.tempfile, str(e)))
542            raise e
543        return source
544
545    def getOriginalSource(self, maxbytes=None):
546        """old name for get_original_source"""
547        return self.get_original_source(maxbytes)
548
549    def get_headers(self):
550        """returns the message headers as string"""
551        headers = re.split(
552            b'(?:\n\n)|(?:\r\n\r\n)', self.get_source(maxbytes=1048576), 1)[0]
553        return headers
554
555    def get_client_info(self, config=None):
556        """returns information about the client that submitted this message.
557        (helo,ip,reversedns)
558
559        In before-queue mode this info is extracted using the XFORWARD SMTP protocol extension.
560
561        In after-queue mode this information is extracted from the message Received: headers and therefore probably not 100% reliable
562        all information is returned as-is, this means for example, that non-fcrdns client will show 'unknown' as reverse dns value.
563
564        if no config object is passed, the first parseable Received header is used. otherwise, the config is used to determine the correct boundary MTA (trustedhostregex / boundarydistance)
565        """
566        if self.clientinfo is not None:
567            return self.clientinfo
568
569        if config is None:
570            clientinfo = self.client_info_from_rcvd()
571
572        else:
573            clientinfo = self.client_info_from_rcvd(config.get(
574                'environment', 'trustedhostsregex'), config.getint('environment', 'boundarydistance'))
575        self.clientinfo = clientinfo
576        return clientinfo
577
578    def client_info_from_rcvd(self, ignoreregex=None, skip=0):
579        """returns information about the client that submitted this message.
580        (helo,ip,reversedns)
581
582        This information is extracted from the message Received: headers and therefore probably not 100% reliable
583        all information is returned as-is, this means for example, that non-fcrdns client will show 'unknown' as reverse dns value.
584
585        if ignoreregex is not None, all results which match this regex in either helo,ip or reversedns will be ignored
586
587        By default, this method starts searching at the top Received Header. Set a higher skip value to start searching further down.
588
589        both these arguments can be used to filter received headers from local systems in order to get the information from a boundary MTA
590
591        returns None if the client info can not be found or if all applicable values are filtered by skip/ignoreregex
592        """
593        ignorere = None
594        if ignoreregex is not None and ignoreregex != '':
595            ignorere = re.compile(ignoreregex)
596
597        unknown = None
598
599        receivedheaders = self.get_message_rep().get_all('Received')
600        if receivedheaders is None:
601            return unknown
602
603        for rcvdline in receivedheaders[skip:]:
604            h_rev_ip = self._parse_rcvd_header(rcvdline)
605            if h_rev_ip is None:
606                return unknown
607
608            helo, revdns, ip = h_rev_ip
609
610            # check if hostname or ip matches the ignore re, try next header if
611            # it does
612            if ignorere is not None:
613                excludematch = ignorere.search(ip)
614                if excludematch is not None:
615                    continue
616
617                excludematch = ignorere.search(revdns)
618                if excludematch is not None:
619                    continue
620
621                excludematch = ignorere.search(helo)
622                if excludematch is not None:
623                    continue
624
625            clientinfo = helo, ip, revdns
626            return clientinfo
627        # we should only land here if we only have received headers in
628        # mynetworks
629        return unknown
630
631    def _parse_rcvd_header(self, rcvdline):
632        """return tuple HELO,REVERSEDNS,IP from received Header line, or None, if extraction fails"""
633        receivedpattern = re.compile(
634            "^from\s(?P<helo>[^\s]+)\s\((?P<revdns>[^\s]+)\s\[(?:IPv6:)?(?P<ip>(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})|(?:[0-9a-f:]{3,40}))\]\)")
635        match = receivedpattern.search(rcvdline)
636        if match is None:
637            return None
638        return match.groups()
639
640
641# it is important that this class explicitly extends from object, or
642# __subclasses__() will not work!
643
644
class BasicPlugin(object):

    """Base class for all plugins"""

    def __init__(self, config, section=None):
        """
        :param config: configuration object, check_config reads values with config.get(section, option)
        :param section: name of the config section of this plugin, defaults to the name of the plugin class
        """
        if section is None:
            self.section = self.__class__.__name__
        else:
            self.section = section

        self.config = config
        # config options expected by this plugin, see check_config for the supported formats
        self.requiredvars = {}

    def _logger(self):
        """returns the logger for this plugin (named fuglu.plugin.<class name>)"""
        myclass = self.__class__.__name__
        loggername = "fuglu.plugin.%s" % myclass
        return logging.getLogger(loggername)

    def lint(self):
        """run the config check and return its result (True if everything is ok)"""
        return self.checkConfig()

    def checkConfig(self):
        """old name for check_config"""
        return self.check_config()

    def check_config(self):
        """Print missing / invalid configuration settings listed in requiredvars.

        requiredvars can have one of two formats:

        - old config style: a list or tuple. Each entry is either an option name (looked up in the
          section of this plugin) or a (section, option) tuple
        - new config style: a dict option name -> info dict. The info dict can hold a 'section' to
          look the option up in another section than the section of this plugin and a 'validator',
          a callable which gets the configured value and returns False if the value is not acceptable

        :return: True if all options are available (and valid), False otherwise
        """
        allOK = True

        # old config style
        if isinstance(self.requiredvars, (tuple, list)):
            for configvar in self.requiredvars:
                # the type of the individual entry decides, not the type of the container
                if isinstance(configvar, tuple):
                    (section, option) = configvar
                else:
                    option = configvar
                    section = self.section
                try:
                    self.config.get(section, option)
                except configparser.NoOptionError:
                    print("Missing configuration value [%s] :: %s" % (
                        section, option))
                    allOK = False
                except configparser.NoSectionError:
                    print("Missing configuration section %s" % section)
                    allOK = False

        # new config style
        if isinstance(self.requiredvars, dict):
            for option, infodic in self.requiredvars.items():
                section = self.section
                if 'section' in infodic:
                    section = infodic['section']

                try:
                    var = self.config.get(section, option)
                    if 'validator' in infodic:
                        if not infodic["validator"](var):
                            print("Validation failed for [%s] :: %s" % (
                                section, option))
                            allOK = False
                except configparser.NoSectionError:
                    print("Missing configuration section [%s] :: %s" % (
                        section, option))
                    allOK = False
                except configparser.NoOptionError:
                    print("Missing configuration value [%s] :: %s" % (
                        section, option))
                    allOK = False

        return allOK

    def __str__(self):
        """the class name, followed by the section name in parentheses if the section has a different name"""
        classname = self.__class__.__name__
        if self.section == classname:
            return classname
        else:
            return '%s(%s)' % (classname, self.section)
724
725
class ScannerPlugin(BasicPlugin):

    """Scanner Plugin Base Class"""

    def examine(self, suspect):
        """Placeholder for the examine() method of a scanner plugin.
        This base implementation only logs a warning that the method is not implemented"""
        log = self._logger()
        log.warning('Unimplemented examine() method')
732
733
class PrependerPlugin(BasicPlugin):

    """Prepender Plugins - Plugins run before the scanners that can influence
    the list of scanners being run for a certain message"""

    def pluginlist(self, suspect, pluginlist):
        """return the modified pluginlist or None for no change.
        This base implementation logs a warning that the method is not implemented and returns None"""
        log = self._logger()
        log.warning('Unimplemented pluginlist() method')
        return None
743
744
class AppenderPlugin(BasicPlugin):

    """Appender Plugins are run after the scan process (and after the re-injection if the message
    was accepted)"""

    def process(self, suspect, decision):
        """Placeholder for the process() method of an appender plugin.
        This base implementation only logs a warning that the method is not implemented"""
        log = self._logger()
        log.warning('Unimplemented process() method')
752
753
754class SuspectFilter(object):
755
756    """Allows filtering Suspect based on header/tag/body regexes"""
757
758    def __init__(self, filename):
759        self.filename = filename
760        self.patterns = []
761
762        self.reloadinterval = 30
763        self.lastreload = 0
764        self.logger = logging.getLogger('fuglu.suspectfilter')
765
766        if filename is not None:
767            self._reloadifnecessary()
768        self.stripre = re.compile(r'<[^>]*?>')
769
770    def _reloadifnecessary(self):
771        now = time.time()
772        # check if reloadinterval has passed
773        if now - self.lastreload < self.reloadinterval:
774            return
775        if self.file_changed():
776            self._reload()
777
778    def _load_simplestyle_line(self, line):
779        sp = line.split(None, 2)
780        if len(sp) < 2:
781            raise Exception(
782                """"Invalid line '%s' in Rulefile %s. Ignoring.""" % (line, self.filename))
783
784        args = None
785        if len(sp) == 3:
786            args = sp[2]
787
788        fieldname = sp[0]
789        # strip ending : (request AXB)
790        if fieldname.endswith(':'):
791            fieldname = fieldname[:-1]
792        regex = sp[1]
793        try:
794            pattern = re.compile(regex, re.IGNORECASE | re.DOTALL)
795        except Exception as e:
796            raise Exception(
797                'Could not compile regex %s in file %s (%s)' % (regex, self.filename, e))
798
799        tup = (fieldname, pattern, args)
800        return tup
801
802    def _load_perlstyle_line(self, line):
803        patt = r"""(?P<fieldname>[a-zA-Z0-9\-\.\_\:]+)[:]?\s+\/(?P<regex>(?:\\.|[^/\\])*)/(?P<flags>[IiMm]+)?((?:\s*$)|(?:\s+(?P<args>.*)))$"""
804        m = re.match(patt, line)
805        if m is None:
806            return None
807
808        groups = m.groupdict()
809        regex = groups['regex']
810        flags = groups['flags']
811        if flags is None:
812            flags = []
813        args = groups['args']
814        if args is not None and args.strip() == '':
815            args = None
816        fieldname = groups['fieldname']
817        if fieldname.endswith(':'):
818            fieldname = fieldname[:-1]
819
820        reflags = 0
821        for flag in flags:
822            flag = flag.lower()
823            if flag == 'i':
824                reflags |= re.I
825            if flag == 'm':
826                reflags |= re.M
827
828        try:
829            pattern = re.compile(regex, reflags)
830        except Exception as e:
831            raise Exception(
832                'Could not compile regex %s in file %s (%s)' % (regex, self.filename, e))
833
834        tup = (fieldname, pattern, args)
835        return tup
836
837    def _reload(self):
838        self.logger.info('Reloading Rulefile %s' % self.filename)
839        statinfo = os.stat(self.filename)
840        ctime = statinfo.st_ctime
841        self.lastreload = ctime
842        with open(self.filename, 'r') as fp:
843            lines = fp.readlines()
844        newpatterns = []
845
846        for line in lines:
847            line = line.strip()
848            if line == "":
849                continue
850            if line.startswith('#'):
851                continue
852
853            # try advanced regex line
854            #<headername> /regex/<flags> <arguments>
855            try:
856                tup = self._load_perlstyle_line(line)
857                if tup is not None:
858                    newpatterns.append(tup)
859                    continue
860            except Exception as e:
861                self.logger.error(
862                    "perl style line failed %s, error: %s" % (line, str(e)))
863                continue
864
865            # line shold be "headername    regex    arguments"
866            try:
867                tup = self._load_simplestyle_line(line)
868                newpatterns.append(tup)
869                continue
870            except Exception as e:
871                self.logger.error(str(e))
872                continue
873
874        self.patterns = newpatterns
875
876    def strip_text(self, content, remove_tags=None, replace_nbsp=True, use_bfs=True):
877        """Strip HTML Tags from content, replace newline with space (like Spamassassin)"""
878
879        if remove_tags is None:
880            remove_tags = ['script', 'style']
881
882        # content may land as a bytes object in py3, so we have to convert to a string so we can
883        # replace newline with space
884        # if it's unicode, we don't convert
885        if isinstance(content, bytes):  # in py2 bytes is an alias for str, no change
886            content = str(content)
887        content = content.replace("\n", " ")
888
889        if HAVE_BEAUTIFULSOUP and use_bfs:
890            if BS_VERSION >= 4:
891                soup = BeautifulSoup.BeautifulSoup(content, "lxml")
892            else:
893                soup = BeautifulSoup.BeautifulSoup(content)
894            for r in remove_tags:
895                [x.extract() for x in soup.findAll(r)]
896
897            if BS_VERSION >= 4:
898                stripped = soup.get_text()
899                if replace_nbsp:
900                    stripped = stripped.replace(u'\xa0', u' ')
901                return stripped
902            else:
903                stripped = ''.join(
904                    # Can retain unicode check since BS < 4 is Py2 only
905                    [e for e in soup.recursiveChildGenerator() \
906                        if isinstance(e, unicode) \
907                        and not isinstance(e, BeautifulSoup.Declaration) \
908                        and not isinstance(e, BeautifulSoup.ProcessingInstruction) \
909                        and not isinstance(e, BeautifulSoup.Comment)])
910                if replace_nbsp:
911                    stripped = stripped.replace(u'\xa0', u' ')
912                return stripped
913
914        # no BeautifulSoup available, let's try a modified version of pyzor's
915        # html stripper
916        stripper = HTMLStripper(strip_tags=remove_tags)
917
918        try:
919            # always try to replace nbsp as HTMLStripper would just remove them
920            content = content.replace("&nbsp;", " ").replace("&#xa0;", " ").replace("&#160;", " ")
921        except Exception:
922            pass
923
924        try:
925            stripper.feed(content)
926            return stripper.get_stripped_data()
927        except Exception:  # ignore parsing/encoding errors
928            pass
929        # use regex replace
930        return re.sub(self.stripre, '', content)
931
932    def get_decoded_textparts(self, messagerep):
933        """Returns a list of all text contents"""
934        textparts = []
935        for part in messagerep.walk():
936            payload = None
937            if part.get_content_maintype() == 'text' and (not part.is_multipart()):
938                payload = part.get_payload(None, True)
939
940            #multipart/mixed are text by default as well
941            if part.get_content_maintype() == 'multipart' and part.get_content_subtype() == 'mixed':
942                payload = part.get_payload(None, True)
943
944            # payload can be None even if it was returned from part.get_payload()
945            if payload is not None:
946                textparts.append(payload)
947        return textparts
948
949    def get_field(self, suspect, headername):
950        """return a list of mail header values or special values. If the value can not be found, an empty list is returned.
951
952        headers:
953            just the headername for normal headers
954            mime:headername for attached mime part headers
955
956        envelope data:
957            envelope_from (or from_address)
958            envelope_to (or to_address)
959            from_domain
960            to_domain
961            clientip
962            clienthostname (fcrdns or 'unknown')
963            clienthelo
964
965        tags
966            @tagname
967
968        body source:
969            body:full -> (full source, encoded)
970            body:stripped (or just 'body') : -> returns text/* bodyparts with tags and newlines stripped
971            body:raw -> decoded raw message body parts
972
973
974        """
975        # builtins
976        if headername == 'envelope_from' or headername == 'from_address':
977            return [suspect.from_address, ]
978        if headername == 'envelope_to' or headername == 'to_address':
979            return suspect.recipients
980        if headername == 'from_domain':
981            return [suspect.from_domain, ]
982        if headername == 'to_domain':
983            return [suspect.to_domain, ]
984        if headername == 'body:full':
985            return [suspect.get_original_source()]
986
987        if headername in ['clientip', 'clienthostname', 'clienthelo']:
988            clinfo = suspect.get_client_info()
989            if clinfo is None:
990                return []
991            if headername == 'clienthelo':
992                return [clinfo[0], ]
993            if headername == 'clientip':
994                return [clinfo[1], ]
995            if headername == 'clienthostname':
996                return [clinfo[2], ]
997
998        # if it starts with a @ we return a tag, not a header
999        if headername[0:1] == '@':
1000            tagname = headername[1:]
1001            tagval = suspect.get_tag(tagname)
1002            if tagval is None:
1003                return []
1004            if isinstance(tagval, list):
1005                return tagval
1006            return [tagval]
1007
1008        messagerep = suspect.get_message_rep()
1009
1010        # body rules on decoded text parts
1011        if headername == 'body:raw':
1012            return self.get_decoded_textparts(messagerep)
1013
1014        if headername == 'body' or headername == 'body:stripped':
1015            return list(map(self.strip_text, self.get_decoded_textparts(messagerep)))
1016
1017        if headername.startswith('mime:'):
1018            allvalues = []
1019            realheadername = headername[5:]
1020            for part in messagerep.walk():
1021                hdrslist = self._get_headers(realheadername, part)
1022                allvalues.extend(hdrslist)
1023            return allvalues
1024
1025        # standard header
1026        return self._get_headers(headername, messagerep)
1027
1028    def _get_headers(self, headername, payload):
1029        valuelist = []
1030        if '*' in headername:
1031            regex = re.escape(headername)
1032            regex = regex.replace('\*', '.*')
1033            patt = re.compile(regex, re.IGNORECASE)
1034
1035            for h in list(payload.keys()):
1036                if re.match(patt, h) is not None:
1037                    valuelist.extend(payload.get_all(h,[]))
1038        else:
1039            valuelist = payload.get_all(headername,[])
1040
1041        return valuelist
1042
1043    def matches(self, suspect, extended=False):
1044        """returns (True,arg) if any regex matches, (False,None) otherwise
1045
1046        if extended=True, returns all available info about the match in a tuple:
1047        True, (fieldname, matchedvalue, arg, regex)
1048        """
1049        self._reloadifnecessary()
1050
1051        for tup in self.patterns:
1052            (fieldname, pattern, arg) = tup
1053            vals = self.get_field(suspect, fieldname)
1054            if vals is None or len(vals) == 0:
1055                self.logger.debug('No field %s found' % fieldname)
1056                continue
1057
1058            for val in vals:
1059                if val is None:
1060                    continue
1061
1062                strval = str(val)
1063                if pattern.search(strval):
1064                    self.logger.debug("""MATCH field %s (arg '%s') regex '%s' against value '%s'""" % (
1065                        fieldname, arg, pattern.pattern, val))
1066                    suspect.debug("message matches rule in %s: field=%s arg=%s regex=%s content=%s" % (
1067                        self.filename, fieldname, arg, pattern.pattern, val))
1068                    if extended:
1069                        return True, (fieldname, strval, arg, pattern.pattern)
1070                    else:
1071                        return True, arg
1072                else:
1073                    self.logger.debug("""NO MATCH field %s (arg '%s') regex '%s' against value '%s'""" % (
1074                        fieldname, arg, pattern.pattern, val))
1075
1076        self.logger.debug('No match found')
1077        suspect.debug("message does not match any rule in %s" % self.filename)
1078        return False, None
1079
1080    def get_args(self, suspect, extended=False):
1081        """returns all args of matched regexes in a list
1082        if extended=True:  returns a list of tuples with all available information:
1083        (fieldname, matchedvalue, arg, regex)
1084        """
1085        ret = []
1086        self._reloadifnecessary()
1087        for tup in self.patterns:
1088            (fieldname, pattern, arg) = tup
1089            vals = self.get_field(suspect, fieldname)
1090            if vals is None or len(vals) == 0:
1091                self.logger.debug('No field %s found' % fieldname)
1092                continue
1093            for val in vals:
1094                if val is None:
1095                    continue
1096                strval = str(val)
1097                if pattern.search(strval) is not None:
1098                    self.logger.debug("""MATCH field %s (arg '%s') regex '%s' against value '%s'""" % (
1099                        fieldname, arg, pattern.pattern, val))
1100                    suspect.debug("message matches rule in %s: field=%s arg=%s regex=%s content=%s" % (
1101                        self.filename, fieldname, arg, pattern.pattern, val))
1102                    if extended:
1103                        ret.append((fieldname, strval, arg, pattern.pattern))
1104                    else:
1105                        ret.append(arg)
1106                else:
1107                    self.logger.debug("""NO MATCH field %s (arg '%s') regex '%s' against value '%s'""" % (
1108                        fieldname, arg, pattern.pattern, val))
1109
1110        return ret
1111
1112    def getArgs(self, suspect):
1113        """old name for get_args"""
1114        return self.get_args(suspect)
1115
1116    def file_changed(self):
1117        """Return True if the file has changed on disks since the last reload"""
1118        if not os.path.isfile(self.filename):
1119            return False
1120        statinfo = os.stat(self.filename)
1121        ctime = statinfo.st_ctime
1122        if ctime > self.lastreload:
1123            return True
1124        return False
1125
1126    def lint(self):
1127        """check file and print warnings to console. returns True if everything is ok, False otherwise"""
1128        if not os.path.isfile(self.filename):
1129            print("SuspectFilter file not found: %s" % self.filename)
1130            return False
1131        with open(self.filename, 'r') as fp:
1132            lines = fp.readlines()
1133        lineno = 0
1134        for line in lines:
1135            lineno += 1
1136            line = line.strip()
1137            if line == "":
1138                continue
1139            if line.startswith('#'):
1140                continue
1141            try:
1142                tup = self._load_perlstyle_line(line)
1143                if tup is not None:
1144                    continue
1145                self._load_simplestyle_line(line)
1146            except Exception as e:
1147                print("Error in SuspectFilter file '%s', lineno %s , line '%s' : %s" % (
1148                    self.filename, lineno, line, str(e)))
1149                return False
1150        return True
1151
1152
class HTMLStripper(HTMLParser):

    """HTML parser which collects the text data of the document it is fed with.
    Data is not collected between the start tag and the end tag of one of the
    strip_tags (default: script and style)."""

    def __init__(self, strip_tags=None):
        HTMLParser.__init__(self)
        if not strip_tags:
            strip_tags = ['script', 'style']
        self.strip_tags = strip_tags
        self.reset()
        self.stripped_data = []
        self.collect = True

    def handle_data(self, data):
        """Remember non-empty data unless collecting is currently switched off"""
        if not self.collect or not data:
            return
        self.stripped_data.append(data)

    def handle_starttag(self, tag, attrs):
        """Switch collecting off when one of the strip_tags is opened"""
        HTMLParser.handle_starttag(self, tag, attrs)
        tagname = tag.lower()
        if tagname in self.strip_tags:
            self.collect = False

    def handle_endtag(self, tag):
        """Switch collecting on again when one of the strip_tags is closed"""
        HTMLParser.handle_endtag(self, tag)
        tagname = tag.lower()
        if tagname in self.strip_tags:
            self.collect = True

    def get_stripped_data(self):
        """Return all data collected so far as one string"""
        collected = ''.join(self.stripped_data)
        return collected
1178
1179
class FileList(object):

    """Map all lines from a textfile into a list. If the file is changed, the list is refreshed automatically
    Each line can be run through a callback filter which can change or remove the content.

    filename: The textfile which should be mapped to a list. This can be changed at runtime. If None, an empty list will be returned.
    strip: remove leading/trailing whitespace from each line. Note that the newline character is always stripped
    skip_empty: skip empty lines (if used in combination with strip: skip all lines with only whitespace)
    skip_comments: skip lines starting with #
    lowercase: lowercase each line
    additional_filters: function or list/tuple of functions which will be called for each line on reload.
        Each function accept a single argument and must return a (possibly modified) line or None to skip this line
    minimum_time_between_reloads: number of seconds to cache the list before it will be reloaded if the file changes.
        The file is not checked for changes while the change time recorded at the last reload is younger than this.
    """

    def __init__(self, filename=None, strip=True, skip_empty=True, skip_comments=True, lowercase=False, additional_filters=None, minimum_time_between_reloads=5):
        self.filename = filename
        # the attribute name (including the typo) is kept for backward compatibility
        self.minium_time_between_reloads = minimum_time_between_reloads
        self._lastreload = 0
        # filename the current content was loaded from, used to detect a
        # filename change at runtime
        self._loaded_filename = None
        self.linefilters = []
        self.content = []
        self.logger = logging.getLogger('%s.filelist' % __package__)
        self.lock = threading.Lock()

        # we always strip newline
        self.linefilters.append(lambda x: x.rstrip('\r\n'))

        if strip:
            self.linefilters.append(lambda x: x.strip())

        if skip_empty:
            self.linefilters.append(lambda x: x if x != '' else None)

        if skip_comments:
            self.linefilters.append(
                lambda x: None if x.strip().startswith('#') else x)

        if lowercase:
            self.linefilters.append(lambda x: x.lower())

        if additional_filters is not None:
            if isinstance(additional_filters, (list, tuple)):
                self.linefilters.extend(additional_filters)
            else:
                self.linefilters.append(additional_filters)

        if filename is not None:
            self._reload_if_necessary()

    def _reload_if_necessary(self):
        """Calls _reload if the file has been changed since the last reload"""
        filename = self.filename
        if filename is None:
            return

        # a different filename must be loaded immediately, the reload interval
        # only applies to changes of the file which is already loaded
        switched = filename != self._loaded_filename
        if not switched and time.time() - self._lastreload < self.minium_time_between_reloads:
            return
        if not self.file_changed():
            return

        with self.lock:
            # another thread may have done the reload while we waited for the lock
            if self.file_changed():
                self._reload()
                self._loaded_filename = filename

    def _reload(self):
        """Reload the file and build the list"""
        filename = self.filename
        self.logger.info('Reloading file %s' % filename)
        self._lastreload = os.stat(filename).st_ctime
        with open(filename, 'r') as fp:
            lines = fp.readlines()
        newcontent = []

        for line in lines:
            for func in self.linefilters:
                line = func(line)
                # a filter returning None removes the line
                if line is None:
                    break

            if line is not None:
                newcontent.append(line)

        self.content = newcontent

    def file_changed(self):
        """Return True if the file has changed on disks since the last reload.
        This is also the case if filename points to a different file than the one loaded.
        Returns False if no filename is set or the file does not exist."""
        filename = self.filename
        if filename is None or not os.path.isfile(filename):
            return False
        if filename != self._loaded_filename:
            return True
        return os.stat(filename).st_ctime > self._lastreload

    def get_list(self):
        """Returns the current list. If the file has been changed since the last call, it will rebuild the list automatically.
        Returns an empty list if filename is None."""
        if self.filename is None:
            return []
        self._reload_if_necessary()
        return self.content
1279
1280
1281
class Cache(object):
    """
    Simple local cache object.
    cached data will expire after a defined interval

    cachetime: number of seconds an entry stays valid
    cleanupinterval: number of seconds between two runs of the cleanup thread
        which removes expired entries

    A daemon thread running clear_cache_thread is started for every instance.
    """

    def __init__(self, cachetime=30, cleanupinterval=300):
        self.cache = {}
        self.cachetime = cachetime
        self.cleanupinterval = cleanupinterval
        self.lock = threading.Lock()
        self.logger = logging.getLogger("%s.settingscache" % __package__)

        t = threading.Thread(target=self.clear_cache_thread)
        t.daemon = True
        t.start()

    def put_cache(self, key, obj):
        """Store obj with the current time under key. Errors are logged, not raised."""
        try:
            with self.lock:
                self.cache[key] = (obj, time.time())
        except Exception as e:
            self.logger.exception(e)

    def get_cache(self, key):
        """Return the object stored under key or None if there is no such entry
        or the entry is expired. Expired entries are removed from the cache.
        Errors are logged, not raised."""
        # defined before the try block so it is always bound at the return
        ret = None
        try:
            with self.lock:
                if key in self.cache:
                    obj, instime = self.cache[key]
                    if time.time() - instime < self.cachetime:
                        ret = obj
                    else:
                        del self.cache[key]
        except Exception as e:
            self.logger.exception(e)
        return ret

    def clear_cache_thread(self):
        """Endless loop removing the expired entries every cleanupinterval seconds"""
        while True:
            time.sleep(self.cleanupinterval)
            now = time.time()
            # defined before the try block, an error during cleanup must not
            # break the log call below and end the thread
            cleancount = 0
            try:
                with self.lock:
                    # iterate over a copy of the keys as entries get deleted
                    for key in list(self.cache):
                        obj, instime = self.cache[key]
                        if now - instime > self.cachetime:
                            del self.cache[key]
                            cleancount += 1
            except Exception as e:
                self.logger.exception(e)
            self.logger.debug("Cleaned %s expired entries." % cleancount)
1357
1358
1359
class CacheSingleton(object):
    """
    Holds one default Cache instance per process.
    Each process needs its own Cache instance since otherwise
    the Threading.Lock will screw up and block the execution.

    The Cache and the pid of the process which created it are stored on the
    class. Attributes which are not found on the singleton object are looked
    up on the stored Cache instance.
    """

    instance = None
    procPID = None

    def __init__(self, *args, **kwargs):
        current_pid = os.getpid()
        log = logging.getLogger("%s.CacheSingleton" % __package__)

        holder = CacheSingleton
        if holder.instance is not None and holder.procPID == current_pid:
            # this process already has its cache
            log.debug("Return existing Cache Singleton for process with pid: %u"%current_pid)
            return

        if holder.instance is None:
            log.info("Create CacheSingleton for process with pid: %u"%current_pid)
        else:
            # the stored cache belongs to a different process
            log.warning("Replace CacheSingleton(created by process %u) for process with pid: %u"%(holder.procPID,current_pid))

        holder.instance = Cache(*args, **kwargs)
        holder.procPID = current_pid

    def __getattr__(self, name):
        target = self.instance
        return getattr(target, name)
1386
1387
def get_default_cache():
    """
    Return a CacheSingleton object, which gives access to
    the default Cache of the current process
    """
    default_cache = CacheSingleton()
    return default_cache
1393