# -*- coding: UTF-8 -*-
# Copyright 2009-2018 Oli Schacher
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
import logging
import os
import sys
import time
import socket
import uuid
import threading
from fuglu.localStringEncoding import force_uString, force_bString
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser

HAVE_BEAUTIFULSOUP = False
BS_VERSION = 0
try:
    import bs4 as BeautifulSoup
    HAVE_BEAUTIFULSOUP = True
    BS_VERSION = 4
except ImportError:
    pass

if not HAVE_BEAUTIFULSOUP:
    try:
        import BeautifulSoup
        HAVE_BEAUTIFULSOUP = True
        BS_VERSION = 3
    except ImportError:
        pass

import email
import re
try:
    import configparser
except ImportError:
    import ConfigParser as configparser
import datetime
from string import Template
from email.header import Header

# constants

DUNNO = 0  # go on
ACCEPT = 1  # accept message, no further tests
DELETE = 2  # blackhole, no further tests
REJECT = 3  # reject, no further tests
DEFER = 4  # defer, no further tests

ALLCODES = {
    'DUNNO': DUNNO,
    'ACCEPT': ACCEPT,
    'DELETE': DELETE,
    'REJECT': REJECT,
    'DEFER': DEFER,
}

# action aliases that are resolved through the configuration rather than
# through ALLCODES directly: alias -> (config section, config option)
_CONFIG_ACTION_ALIASES = {
    'DEFAULTHIGHSPAMACTION': ('spam', 'defaulthighspamaction'),
    'DEFAULTLOWSPAMACTION': ('spam', 'defaultlowspamaction'),
    'DEFAULTVIRUSACTION': ('virus', 'defaultvirusaction'),
}


def actioncode_to_string(actioncode):
    """Return the human readable string for this code"""
    for key, val in ALLCODES.items():
        if val == actioncode:
            return key
    if actioncode is None:
        return "NULL ACTION CODE"
    return 'INVALID ACTION CODE %s' % actioncode


def string_to_actioncode(actionstring, config=None):
    """Return the numeric code for this action string, or None for unknown actions.

    DISCARD is accepted as an alias for DELETE. If a config object is passed,
    the DEFAULTHIGHSPAMACTION / DEFAULTLOWSPAMACTION / DEFAULTVIRUSACTION
    aliases are resolved through the corresponding configuration option.
    """
    upper = actionstring.upper().strip()

    # support DISCARD as alias for DELETE
    if upper == 'DISCARD':
        upper = 'DELETE'

    # resolve configuration-driven aliases; the configured value is then
    # looked up in ALLCODES like any other action string
    if config is not None and upper in _CONFIG_ACTION_ALIASES:
        section, option = _CONFIG_ACTION_ALIASES[upper]
        upper = config.get(section, option).upper()

    if upper not in ALLCODES:
        return None
    return ALLCODES[upper]


def apply_template(templatecontent, suspect, values=None, valuesfunction=None):
    """Replace templatecontent variables as defined in http://fumail.github.io/fuglu/plugins-index.html#template-variables
    with actual values from suspect.

    The calling function can pass additional values by passing a values dict.

    If valuesfunction is not None, it is called with the final dict with all
    built-in and passed values and allows further modifications, like SQL
    escaping etc.
    """
    if values is None:
        values = {}

    default_template_values(suspect, values)

    if valuesfunction is not None:
        values = valuesfunction(values)

    template = Template(force_uString(templatecontent))

    message = template.safe_substitute(values)
    return message


def default_template_values(suspect, values=None):
    """Return a dict with default template variables applicable for this suspect.

    If values is not None, the passed dict is filled in place (and also returned)
    instead of creating a new one.
    """

    if values is None:
        values = {}

    values['id'] = suspect.id
    values['timestamp'] = suspect.timestamp
    values['from_address'] = suspect.from_address
    values['to_address'] = suspect.to_address
    values['from_domain'] = suspect.from_domain
    values['to_domain'] = suspect.to_domain
    values['subject'] = suspect.get_message_rep()['subject']
    values['date'] = str(datetime.date.today())
    values['time'] = time.strftime('%X')
    return values
HOSTNAME = socket.gethostname()


def yesno(val):
    """returns the string 'yes' for values that evaluate to True, 'no' otherwise"""
    if val:
        return 'yes'
    else:
        return 'no'


class Suspect(object):

    """
    The suspect represents the message to be scanned. Each scannerplugin will be presented
    with a suspect and may modify the tags or even the message content itself.
    """

    def __init__(self, from_address, recipients, tempfile):
        self.source = None
        """holds the message source if set directly"""

        self._msgrep = None
        """holds a copy of the message representation"""

        # tags set by plugins
        self.tags = {}
        self.tags['virus'] = {}
        self.tags['blocked'] = {}
        self.tags['spam'] = {}
        self.tags['highspam'] = {}
        self.tags['decisions'] = []
        self.tags['scantimes'] = []

        # temporary file containing the message source
        self.tempfile = tempfile

        # stuff set from smtp transaction
        self.size = os.path.getsize(tempfile)
        self.from_address = from_address

        # backwards compatibility, recipients can be a single address
        if isinstance(recipients, list):
            self.recipients = recipients
        else:
            self.recipients = [recipients, ]

        # basic email validitiy check - nothing more than necessary for our internal assumptions
        for rec in self.recipients:
            if rec is None:
                raise ValueError("Recipient address can not be None")
            if not re.match(r"[^@]+@[^@]+$", rec):
                raise ValueError("Invalid recipient address: %s" % rec)

        # additional basic information
        self.timestamp = time.time()
        self.id = self._generate_id()

        # headers which are prepended before re-injecting the message
        self.addheaders = {}

        # the empty sender ('') is valid (bounce messages)
        if self.from_address is None:
            self.from_address = ''

        if self.from_address != '' and not re.match(r"[^@]+@[^@]+$", self.from_address):
            raise ValueError("invalid sender address: %s" % self.from_address)

        self.clientinfo = None
        """holds client info tuple: helo, ip, reversedns"""

    @property
    def to_address(self):
        """Returns the first recipient address"""
        try:
            return self.recipients[0]
        except IndexError:
            return None

    @to_address.setter
    def to_address(self, recipient):
        """Sets a single recipient for this suspect, removing all others"""
        self.recipients = [recipient, ]

    @property
    def to_localpart(self):
        """Returns the local part of the first recipient"""
        try:
            return self.to_address.rsplit('@', 1)[0]
        except Exception:
            logging.getLogger('suspect').error('could not extract localpart from recipient address %s' % self.to_address)
            return None

    @property
    def to_domain(self):
        """Returns the domain part of the first recipient"""
        try:
            return self.to_address.rsplit('@', 1)[1]
        except Exception:
            # bugfix: log the recipient address here, not the sender address
            logging.getLogger('suspect').error('could not extract domain from recipient address %s' % self.to_address)
            return None

    @property
    def from_localpart(self):
        """Returns the local part of the sender address ('' for the empty bounce sender)"""
        if self.from_address == '':
            return ''

        else:
            try:
                return self.from_address.rsplit('@', 1)[0]
            except Exception:
                logging.getLogger('suspect').error('could not extract localpart from sender address %s' % self.from_address)
                return None

    @property
    def from_domain(self):
        """Returns the domain part of the sender address ('' for the empty bounce sender)"""
        if self.from_address == '':
            return ''

        else:
            try:
                return self.from_address.rsplit('@', 1)[1]
            except Exception:
                logging.getLogger('suspect').error('could not extract domain from sender address %s' % self.from_address)
                return None

    def _generate_id(self):
        """
        returns a unique id (a string of 32 hex characters)
        """
        return uuid.uuid4().hex

    def debug(self, message):
        """Add a line to the debug log if debugging is enabled for this message"""
        if not self.get_tag('debug'):
            return
        isotime = datetime.datetime.now().isoformat()
        fp = self.get_tag('debugfile')
        try:
            fp.write('%s %s\n' % (isotime, message))
            fp.flush()
        except Exception as e:
            logging.getLogger('suspect').error(
                'Could not write to logfile: %s' % e)

    def get_tag(self, key, defaultvalue=None):
        """returns the tag value. if the tag is not found, return defaultvalue instead (None if no defaultvalue passed)"""
        if key not in self.tags:
            return defaultvalue
        return self.tags[key]

    def set_tag(self, key, value):
        """Set a new tag"""
        self.tags[key] = value

    def is_highspam(self):
        """Returns True if ANY of the spam engines tagged this suspect as high spam"""
        return any(self.tags['highspam'].values())

    def is_spam(self):
        """Returns True if ANY of the spam engines tagged this suspect as spam"""
        return any(self.tags['spam'].values())

    def is_blocked(self):
        """Returns True if ANY plugin tagged this suspect as blocked"""
        return any(self.tags['blocked'].values())

    def is_virus(self):
        """Returns True if ANY of the antivirus engines tagged this suspect as infected"""
        return any(self.tags['virus'].values())

    def is_ham(self):
        """Returns True if message is neither considered to be spam, virus or blocked"""
        if self.is_spam() or self.is_virus() or self.is_blocked() or self.is_highspam():
            return False
        return True

    def update_subject(self, subject_cb, **cb_params):
        """
        update/alter the message subject
        :param subject_cb: callback function that alters the subject. must accept a string and return a string
        :param cb_params: additional parameters to be passed to subject_cb
        :return: True if subject was altered, False otherwise
        """
        msgrep = self.get_message_rep()
        oldsubj = msgrep.get("subject", "")
        newsubj = subject_cb(oldsubj, **cb_params)
        if oldsubj != newsubj:
            del msgrep["subject"]
            msgrep["subject"] = newsubj
            self.set_message_rep(msgrep)
            # remember the original subject once, even across multiple updates
            if self.get_tag('origsubj') is None:
                self.set_tag('origsubj', oldsubj)
            return True
        return False

    def add_header(self, key, value, immediate=False):
        """adds a header to the message. by default, headers will added when re-injecting the message back to postfix
        if you set immediate=True the message source will be replaced immediately. Only set this to true if a header must be
        visible to later plugins (eg. for spamassassin rules), otherwise, leave as False which is faster.
        """
        if immediate:
            # is ignore the right thing to do here?
            value = value.encode('UTF-8', 'ignore')
            hdr = Header(value, header_name=key, continuation_ws=' ')
            hdrline = "%s: %s\n" % (key, hdr.encode())
            src = force_bString(hdrline) + force_bString(self.get_source())
            self.set_source(src)
        else:
            self.addheaders[key] = value

    def addheader(self, key, value, immediate=False):
        """old name for add_header"""
        return self.add_header(key, value, immediate)

    def get_current_decision_code(self):
        """returns the code of the last decision recorded in the 'decisions' tag, DUNNO if none"""
        dectag = self.get_tag('decisions')
        if dectag is None:
            return DUNNO
        try:
            pluginname, code = dectag[-1]
            return code
        except Exception:
            return DUNNO

    def _short_tag_rep(self):
        """return a tag representation suitable for logging, with some tags stripped, some shortened"""
        blacklist = ['decisions', 'scantimes', 'debugfile']
        tagscopy = {}

        for k, v in self.tags.items():
            if k in blacklist:
                continue

            try:
                strrep = str(v)
            except Exception:  # Unicodedecode errors and stuff like that
                continue

            therep = v

            maxtaglen = 100
            if len(strrep) > maxtaglen:
                therep = strrep[:maxtaglen] + "..."

            # specialfixes
            if k == 'SAPlugin.spamscore' and not isinstance(v, str):
                therep = "%.2f" % v

            tagscopy[k] = therep
        return str(tagscopy)

    def log_format(self, template=None):
        """render the given log template with suspect data and scan verdict values"""
        addvals = {
            'size': self.size,
            'spam': yesno(self.is_spam()),
            'highspam': yesno(self.is_highspam()),
            'blocked': yesno(self.is_blocked()),
            'virus': yesno(self.is_virus()),
            'modified': yesno(self.is_modified()),
            'decision': actioncode_to_string(self.get_current_decision_code()),
            'tags': self._short_tag_rep(),
            'fulltags': str(self.tags),
        }
        return apply_template(template, self, addvals)

    def __str__(self):
        """representation good for logging"""
        return self.log_format("Suspect ${id}: from=${from_address} to=${to_address} size=${size} spam=${spam} blocked=${blocked} virus=${virus} modified=${modified} decision=${decision} tags=${tags}")

    def get_message_rep(self):
        """returns the python email api representation of this suspect"""
        # do we have a cached instance already?
        if self._msgrep is not None:
            return self._msgrep

        if self.source is not None:
            if sys.version_info > (3,):
                # Python 3 and larger
                # the basic "str" type is unicode
                if isinstance(self.source, str):
                    msgrep = email.message_from_string(self.source)
                else:
                    msgrep = email.message_from_bytes(self.source)
            else:
                # Python 2.x
                msgrep = email.message_from_string(self.source)

            self._msgrep = msgrep
            return msgrep
        else:
            if sys.version_info > (3,):
                # Python 3 and larger
                # file should be binary...

                # IMPORTANT: It is possible to use email.message_from_bytes BUT this will automatically replace
                # '\r\n' in the message (_payload) by '\n' and the endtoend_test.py will fail!
                tmpSource = self.get_original_source()
                msgrep = email.message_from_bytes(tmpSource)
            else:
                # Python 2.x
                with open(self.tempfile, 'r') as fh:
                    msgrep = email.message_from_file(fh)
            self._msgrep = msgrep
            return msgrep

    def getMessageRep(self):
        """old name for get_message_rep"""
        return self.get_message_rep()

    def set_message_rep(self, msgrep):
        """replace the message content. this must be a standard python email representation
        Warning: setting the source via python email representation seems to break dkim signatures!
        """
        if sys.version_info > (3,):
            # Python 3 and larger
            # stick to bytes...
            try:
                self.set_source(msgrep.as_bytes())
            except AttributeError:
                self.set_source(force_bString(msgrep.as_string()))
        else:
            # Python 2.x
            self.set_source(msgrep.as_string())

        # order is important, set_source sets _msgrep to None
        self._msgrep = msgrep

    def setMessageRep(self, msgrep):
        """old name for set_message_rep"""
        return self.set_message_rep(msgrep)

    def is_modified(self):
        """returns true if the message source has been modified"""
        return self.source is not None

    def get_source(self, maxbytes=None):
        """returns the current message source, possibly changed by plugins"""
        if self.source is not None:
            return self.source[:maxbytes]
        else:
            return self.get_original_source(maxbytes)

    def getSource(self, maxbytes=None):
        """old name for get_source"""
        return self.get_source(maxbytes)

    def set_source(self, source):
        """set the message source, invalidating the cached email representation"""
        self.source = source
        self._msgrep = None

    def setSource(self, source):
        """old name for set_source"""
        return self.set_source(source)

    def get_original_source(self, maxbytes=None):
        """returns the original, unmodified message source"""
        readbytes = -1
        if maxbytes is not None:
            readbytes = maxbytes
        try:
            with open(self.tempfile, 'rb') as fh:
                source = fh.read(readbytes)
        except Exception as e:
            logging.getLogger('fuglu.suspect').error(
                'Cannot retrieve original source from tempfile %s : %s' % (self.tempfile, str(e)))
            raise e
        return source

    def getOriginalSource(self, maxbytes=None):
        """old name for get_original_source"""
        return self.get_original_source(maxbytes)

    def get_headers(self):
        """returns the message headers as string"""
        # NOTE(review): the split pattern is bytes, so this assumes get_source()
        # returns bytes here -- confirm for sources set as str
        headers = re.split(
            b'(?:\n\n)|(?:\r\n\r\n)', self.get_source(maxbytes=1048576), 1)[0]
        return headers

    def get_client_info(self, config=None):
        """returns information about the client that submitted this message.
        (helo,ip,reversedns)

        In before-queue mode this info is extracted using the XFORWARD SMTP protocol extension.

        In after-queue mode this information is extracted from the message Received: headers and therefore probably not 100% reliable
        all information is returned as-is, this means for example, that non-fcrdns client will show 'unknown' as reverse dns value.

        if no config object is passed, the first parseable Received header is used. otherwise, the config is used to determine the correct boundary MTA (trustedhostregex / boundarydistance)
        """
        if self.clientinfo is not None:
            return self.clientinfo

        if config is None:
            clientinfo = self.client_info_from_rcvd()

        else:
            clientinfo = self.client_info_from_rcvd(config.get(
                'environment', 'trustedhostsregex'), config.getint('environment', 'boundarydistance'))
        self.clientinfo = clientinfo
        return clientinfo

    def client_info_from_rcvd(self, ignoreregex=None, skip=0):
        """returns information about the client that submitted this message.
        (helo,ip,reversedns)

        This information is extracted from the message Received: headers and therefore probably not 100% reliable
        all information is returned as-is, this means for example, that non-fcrdns client will show 'unknown' as reverse dns value.

        if ignoreregex is not None, all results which match this regex in either helo,ip or reversedns will be ignored

        By default, this method starts searching at the top Received Header. Set a higher skip value to start searching further down.

        both these arguments can be used to filter received headers from local systems in order to get the information from a boundary MTA

        returns None if the client info can not be found or if all applicable values are filtered by skip/ignoreregex
        """
        ignorere = None
        if ignoreregex is not None and ignoreregex != '':
            ignorere = re.compile(ignoreregex)

        unknown = None

        receivedheaders = self.get_message_rep().get_all('Received')
        if receivedheaders is None:
            return unknown

        for rcvdline in receivedheaders[skip:]:
            h_rev_ip = self._parse_rcvd_header(rcvdline)
            if h_rev_ip is None:
                return unknown

            helo, revdns, ip = h_rev_ip

            # check if hostname or ip matches the ignore re, try next header if
            # it does
            if ignorere is not None:
                excludematch = ignorere.search(ip)
                if excludematch is not None:
                    continue

                excludematch = ignorere.search(revdns)
                if excludematch is not None:
                    continue

                excludematch = ignorere.search(helo)
                if excludematch is not None:
                    continue

            clientinfo = helo, ip, revdns
            return clientinfo
        # we should only land here if we only have received headers in
        # mynetworks
        return unknown

    def _parse_rcvd_header(self, rcvdline):
        """return tuple HELO,REVERSEDNS,IP from received Header line, or None, if extraction fails"""
        # raw string so \s etc. are not treated as (invalid) string escapes
        receivedpattern = re.compile(
            r"^from\s(?P<helo>[^\s]+)\s\((?P<revdns>[^\s]+)\s\[(?:IPv6:)?(?P<ip>(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})|(?:[0-9a-f:]{3,40}))\]\)")
        match = receivedpattern.search(rcvdline)
        if match is None:
            return None
        return match.groups()
"^from\s(?P<helo>[^\s]+)\s\((?P<revdns>[^\s]+)\s\[(?:IPv6:)?(?P<ip>(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})|(?:[0-9a-f:]{3,40}))\]\)") 635 match = receivedpattern.search(rcvdline) 636 if match is None: 637 return None 638 return match.groups() 639 640 641# it is important that this class explicitly extends from object, or 642# __subclasses__() will not work! 643 644 645class BasicPlugin(object): 646 647 """Base class for all plugins""" 648 649 def __init__(self, config, section=None): 650 if section is None: 651 self.section = self.__class__.__name__ 652 else: 653 self.section = section 654 655 self.config = config 656 self.requiredvars = {} 657 658 def _logger(self): 659 """returns the logger for this plugin""" 660 myclass = self.__class__.__name__ 661 loggername = "fuglu.plugin.%s" % myclass 662 return logging.getLogger(loggername) 663 664 def lint(self): 665 return self.checkConfig() 666 667 def checkConfig(self): 668 """old name for check_config""" 669 return self.check_config() 670 671 def check_config(self): 672 """Print missing / non-default configuration settings""" 673 allOK = True 674 675 # old config style 676 if isinstance(self.requiredvars, (tuple, list)): 677 for configvar in self.requiredvars: 678 if isinstance(self.requiredvars, tuple): 679 (section, config) = configvar 680 else: 681 config = configvar 682 section = self.section 683 try: 684 self.config.get(section, config) 685 except configparser.NoOptionError: 686 print("Missing configuration value [%s] :: %s" % ( 687 section, config)) 688 allOK = False 689 except configparser.NoSectionError: 690 print("Missing configuration section %s" % section) 691 allOK = False 692 693 # new config style 694 if isinstance(self.requiredvars, dict): 695 for config, infodic in self.requiredvars.items(): 696 section = self.section 697 if 'section' in infodic: 698 section = infodic['section'] 699 700 try: 701 var = self.config.get(section, config) 702 if 'validator' in infodic: 703 if not infodic["validator"](var): 
704 print("Validation failed for [%s] :: %s" % ( 705 section, config)) 706 allOK = False 707 except configparser.NoSectionError: 708 print("Missing configuration section [%s] :: %s" % ( 709 section, config)) 710 allOK = False 711 except configparser.NoOptionError: 712 print("Missing configuration value [%s] :: %s" % ( 713 section, config)) 714 allOK = False 715 716 return allOK 717 718 def __str__(self): 719 classname = self.__class__.__name__ 720 if self.section == classname: 721 return classname 722 else: 723 return '%s(%s)' % (classname, self.section) 724 725 726class ScannerPlugin(BasicPlugin): 727 728 """Scanner Plugin Base Class""" 729 730 def examine(self, suspect): 731 self._logger().warning('Unimplemented examine() method') 732 733 734class PrependerPlugin(BasicPlugin): 735 736 """Prepender Plugins - Plugins run before the scanners that can influence 737 the list of scanners being run for a certain message""" 738 739 def pluginlist(self, suspect, pluginlist): 740 """return the modified pluginlist or None for no change""" 741 self._logger().warning('Unimplemented pluginlist() method') 742 return None 743 744 745class AppenderPlugin(BasicPlugin): 746 747 """Appender Plugins are run after the scan process (and after the re-injection if the message 748 was accepted)""" 749 750 def process(self, suspect, decision): 751 self._logger().warning('Unimplemented process() method') 752 753 754class SuspectFilter(object): 755 756 """Allows filtering Suspect based on header/tag/body regexes""" 757 758 def __init__(self, filename): 759 self.filename = filename 760 self.patterns = [] 761 762 self.reloadinterval = 30 763 self.lastreload = 0 764 self.logger = logging.getLogger('fuglu.suspectfilter') 765 766 if filename is not None: 767 self._reloadifnecessary() 768 self.stripre = re.compile(r'<[^>]*?>') 769 770 def _reloadifnecessary(self): 771 now = time.time() 772 # check if reloadinterval has passed 773 if now - self.lastreload < self.reloadinterval: 774 return 775 if 
self.file_changed(): 776 self._reload() 777 778 def _load_simplestyle_line(self, line): 779 sp = line.split(None, 2) 780 if len(sp) < 2: 781 raise Exception( 782 """"Invalid line '%s' in Rulefile %s. Ignoring.""" % (line, self.filename)) 783 784 args = None 785 if len(sp) == 3: 786 args = sp[2] 787 788 fieldname = sp[0] 789 # strip ending : (request AXB) 790 if fieldname.endswith(':'): 791 fieldname = fieldname[:-1] 792 regex = sp[1] 793 try: 794 pattern = re.compile(regex, re.IGNORECASE | re.DOTALL) 795 except Exception as e: 796 raise Exception( 797 'Could not compile regex %s in file %s (%s)' % (regex, self.filename, e)) 798 799 tup = (fieldname, pattern, args) 800 return tup 801 802 def _load_perlstyle_line(self, line): 803 patt = r"""(?P<fieldname>[a-zA-Z0-9\-\.\_\:]+)[:]?\s+\/(?P<regex>(?:\\.|[^/\\])*)/(?P<flags>[IiMm]+)?((?:\s*$)|(?:\s+(?P<args>.*)))$""" 804 m = re.match(patt, line) 805 if m is None: 806 return None 807 808 groups = m.groupdict() 809 regex = groups['regex'] 810 flags = groups['flags'] 811 if flags is None: 812 flags = [] 813 args = groups['args'] 814 if args is not None and args.strip() == '': 815 args = None 816 fieldname = groups['fieldname'] 817 if fieldname.endswith(':'): 818 fieldname = fieldname[:-1] 819 820 reflags = 0 821 for flag in flags: 822 flag = flag.lower() 823 if flag == 'i': 824 reflags |= re.I 825 if flag == 'm': 826 reflags |= re.M 827 828 try: 829 pattern = re.compile(regex, reflags) 830 except Exception as e: 831 raise Exception( 832 'Could not compile regex %s in file %s (%s)' % (regex, self.filename, e)) 833 834 tup = (fieldname, pattern, args) 835 return tup 836 837 def _reload(self): 838 self.logger.info('Reloading Rulefile %s' % self.filename) 839 statinfo = os.stat(self.filename) 840 ctime = statinfo.st_ctime 841 self.lastreload = ctime 842 with open(self.filename, 'r') as fp: 843 lines = fp.readlines() 844 newpatterns = [] 845 846 for line in lines: 847 line = line.strip() 848 if line == "": 849 continue 850 if 
line.startswith('#'): 851 continue 852 853 # try advanced regex line 854 #<headername> /regex/<flags> <arguments> 855 try: 856 tup = self._load_perlstyle_line(line) 857 if tup is not None: 858 newpatterns.append(tup) 859 continue 860 except Exception as e: 861 self.logger.error( 862 "perl style line failed %s, error: %s" % (line, str(e))) 863 continue 864 865 # line shold be "headername regex arguments" 866 try: 867 tup = self._load_simplestyle_line(line) 868 newpatterns.append(tup) 869 continue 870 except Exception as e: 871 self.logger.error(str(e)) 872 continue 873 874 self.patterns = newpatterns 875 876 def strip_text(self, content, remove_tags=None, replace_nbsp=True, use_bfs=True): 877 """Strip HTML Tags from content, replace newline with space (like Spamassassin)""" 878 879 if remove_tags is None: 880 remove_tags = ['script', 'style'] 881 882 # content may land as a bytes object in py3, so we have to convert to a string so we can 883 # replace newline with space 884 # if it's unicode, we don't convert 885 if isinstance(content, bytes): # in py2 bytes is an alias for str, no change 886 content = str(content) 887 content = content.replace("\n", " ") 888 889 if HAVE_BEAUTIFULSOUP and use_bfs: 890 if BS_VERSION >= 4: 891 soup = BeautifulSoup.BeautifulSoup(content, "lxml") 892 else: 893 soup = BeautifulSoup.BeautifulSoup(content) 894 for r in remove_tags: 895 [x.extract() for x in soup.findAll(r)] 896 897 if BS_VERSION >= 4: 898 stripped = soup.get_text() 899 if replace_nbsp: 900 stripped = stripped.replace(u'\xa0', u' ') 901 return stripped 902 else: 903 stripped = ''.join( 904 # Can retain unicode check since BS < 4 is Py2 only 905 [e for e in soup.recursiveChildGenerator() \ 906 if isinstance(e, unicode) \ 907 and not isinstance(e, BeautifulSoup.Declaration) \ 908 and not isinstance(e, BeautifulSoup.ProcessingInstruction) \ 909 and not isinstance(e, BeautifulSoup.Comment)]) 910 if replace_nbsp: 911 stripped = stripped.replace(u'\xa0', u' ') 912 return stripped 
913 914 # no BeautifulSoup available, let's try a modified version of pyzor's 915 # html stripper 916 stripper = HTMLStripper(strip_tags=remove_tags) 917 918 try: 919 # always try to replace nbsp as HTMLStripper would just remove them 920 content = content.replace(" ", " ").replace(" ", " ").replace(" ", " ") 921 except Exception: 922 pass 923 924 try: 925 stripper.feed(content) 926 return stripper.get_stripped_data() 927 except Exception: # ignore parsing/encoding errors 928 pass 929 # use regex replace 930 return re.sub(self.stripre, '', content) 931 932 def get_decoded_textparts(self, messagerep): 933 """Returns a list of all text contents""" 934 textparts = [] 935 for part in messagerep.walk(): 936 payload = None 937 if part.get_content_maintype() == 'text' and (not part.is_multipart()): 938 payload = part.get_payload(None, True) 939 940 #multipart/mixed are text by default as well 941 if part.get_content_maintype() == 'multipart' and part.get_content_subtype() == 'mixed': 942 payload = part.get_payload(None, True) 943 944 # payload can be None even if it was returned from part.get_payload() 945 if payload is not None: 946 textparts.append(payload) 947 return textparts 948 949 def get_field(self, suspect, headername): 950 """return a list of mail header values or special values. If the value can not be found, an empty list is returned. 
951 952 headers: 953 just the headername for normal headers 954 mime:headername for attached mime part headers 955 956 envelope data: 957 envelope_from (or from_address) 958 envelope_to (or to_address) 959 from_domain 960 to_domain 961 clientip 962 clienthostname (fcrdns or 'unknown') 963 clienthelo 964 965 tags 966 @tagname 967 968 body source: 969 body:full -> (full source, encoded) 970 body:stripped (or just 'body') : -> returns text/* bodyparts with tags and newlines stripped 971 body:raw -> decoded raw message body parts 972 973 974 """ 975 # builtins 976 if headername == 'envelope_from' or headername == 'from_address': 977 return [suspect.from_address, ] 978 if headername == 'envelope_to' or headername == 'to_address': 979 return suspect.recipients 980 if headername == 'from_domain': 981 return [suspect.from_domain, ] 982 if headername == 'to_domain': 983 return [suspect.to_domain, ] 984 if headername == 'body:full': 985 return [suspect.get_original_source()] 986 987 if headername in ['clientip', 'clienthostname', 'clienthelo']: 988 clinfo = suspect.get_client_info() 989 if clinfo is None: 990 return [] 991 if headername == 'clienthelo': 992 return [clinfo[0], ] 993 if headername == 'clientip': 994 return [clinfo[1], ] 995 if headername == 'clienthostname': 996 return [clinfo[2], ] 997 998 # if it starts with a @ we return a tag, not a header 999 if headername[0:1] == '@': 1000 tagname = headername[1:] 1001 tagval = suspect.get_tag(tagname) 1002 if tagval is None: 1003 return [] 1004 if isinstance(tagval, list): 1005 return tagval 1006 return [tagval] 1007 1008 messagerep = suspect.get_message_rep() 1009 1010 # body rules on decoded text parts 1011 if headername == 'body:raw': 1012 return self.get_decoded_textparts(messagerep) 1013 1014 if headername == 'body' or headername == 'body:stripped': 1015 return list(map(self.strip_text, self.get_decoded_textparts(messagerep))) 1016 1017 if headername.startswith('mime:'): 1018 allvalues = [] 1019 realheadername = 
headername[5:] 1020 for part in messagerep.walk(): 1021 hdrslist = self._get_headers(realheadername, part) 1022 allvalues.extend(hdrslist) 1023 return allvalues 1024 1025 # standard header 1026 return self._get_headers(headername, messagerep) 1027 1028 def _get_headers(self, headername, payload): 1029 valuelist = [] 1030 if '*' in headername: 1031 regex = re.escape(headername) 1032 regex = regex.replace('\*', '.*') 1033 patt = re.compile(regex, re.IGNORECASE) 1034 1035 for h in list(payload.keys()): 1036 if re.match(patt, h) is not None: 1037 valuelist.extend(payload.get_all(h,[])) 1038 else: 1039 valuelist = payload.get_all(headername,[]) 1040 1041 return valuelist 1042 1043 def matches(self, suspect, extended=False): 1044 """returns (True,arg) if any regex matches, (False,None) otherwise 1045 1046 if extended=True, returns all available info about the match in a tuple: 1047 True, (fieldname, matchedvalue, arg, regex) 1048 """ 1049 self._reloadifnecessary() 1050 1051 for tup in self.patterns: 1052 (fieldname, pattern, arg) = tup 1053 vals = self.get_field(suspect, fieldname) 1054 if vals is None or len(vals) == 0: 1055 self.logger.debug('No field %s found' % fieldname) 1056 continue 1057 1058 for val in vals: 1059 if val is None: 1060 continue 1061 1062 strval = str(val) 1063 if pattern.search(strval): 1064 self.logger.debug("""MATCH field %s (arg '%s') regex '%s' against value '%s'""" % ( 1065 fieldname, arg, pattern.pattern, val)) 1066 suspect.debug("message matches rule in %s: field=%s arg=%s regex=%s content=%s" % ( 1067 self.filename, fieldname, arg, pattern.pattern, val)) 1068 if extended: 1069 return True, (fieldname, strval, arg, pattern.pattern) 1070 else: 1071 return True, arg 1072 else: 1073 self.logger.debug("""NO MATCH field %s (arg '%s') regex '%s' against value '%s'""" % ( 1074 fieldname, arg, pattern.pattern, val)) 1075 1076 self.logger.debug('No match found') 1077 suspect.debug("message does not match any rule in %s" % self.filename) 1078 return 
False, None 1079 1080 def get_args(self, suspect, extended=False): 1081 """returns all args of matched regexes in a list 1082 if extended=True: returns a list of tuples with all available information: 1083 (fieldname, matchedvalue, arg, regex) 1084 """ 1085 ret = [] 1086 self._reloadifnecessary() 1087 for tup in self.patterns: 1088 (fieldname, pattern, arg) = tup 1089 vals = self.get_field(suspect, fieldname) 1090 if vals is None or len(vals) == 0: 1091 self.logger.debug('No field %s found' % fieldname) 1092 continue 1093 for val in vals: 1094 if val is None: 1095 continue 1096 strval = str(val) 1097 if pattern.search(strval) is not None: 1098 self.logger.debug("""MATCH field %s (arg '%s') regex '%s' against value '%s'""" % ( 1099 fieldname, arg, pattern.pattern, val)) 1100 suspect.debug("message matches rule in %s: field=%s arg=%s regex=%s content=%s" % ( 1101 self.filename, fieldname, arg, pattern.pattern, val)) 1102 if extended: 1103 ret.append((fieldname, strval, arg, pattern.pattern)) 1104 else: 1105 ret.append(arg) 1106 else: 1107 self.logger.debug("""NO MATCH field %s (arg '%s') regex '%s' against value '%s'""" % ( 1108 fieldname, arg, pattern.pattern, val)) 1109 1110 return ret 1111 1112 def getArgs(self, suspect): 1113 """old name for get_args""" 1114 return self.get_args(suspect) 1115 1116 def file_changed(self): 1117 """Return True if the file has changed on disks since the last reload""" 1118 if not os.path.isfile(self.filename): 1119 return False 1120 statinfo = os.stat(self.filename) 1121 ctime = statinfo.st_ctime 1122 if ctime > self.lastreload: 1123 return True 1124 return False 1125 1126 def lint(self): 1127 """check file and print warnings to console. 
returns True if everything is ok, False otherwise""" 1128 if not os.path.isfile(self.filename): 1129 print("SuspectFilter file not found: %s" % self.filename) 1130 return False 1131 with open(self.filename, 'r') as fp: 1132 lines = fp.readlines() 1133 lineno = 0 1134 for line in lines: 1135 lineno += 1 1136 line = line.strip() 1137 if line == "": 1138 continue 1139 if line.startswith('#'): 1140 continue 1141 try: 1142 tup = self._load_perlstyle_line(line) 1143 if tup is not None: 1144 continue 1145 self._load_simplestyle_line(line) 1146 except Exception as e: 1147 print("Error in SuspectFilter file '%s', lineno %s , line '%s' : %s" % ( 1148 self.filename, lineno, line, str(e))) 1149 return False 1150 return True 1151 1152 1153class HTMLStripper(HTMLParser): 1154 1155 def __init__(self, strip_tags=None): 1156 HTMLParser.__init__(self) 1157 self.strip_tags = strip_tags or ['script', 'style'] 1158 self.reset() 1159 self.collect = True 1160 self.stripped_data = [] 1161 1162 def handle_data(self, data): 1163 if data and self.collect: 1164 self.stripped_data.append(data) 1165 1166 def handle_starttag(self, tag, attrs): 1167 HTMLParser.handle_starttag(self, tag, attrs) 1168 if tag.lower() in self.strip_tags: 1169 self.collect = False 1170 1171 def handle_endtag(self, tag): 1172 HTMLParser.handle_endtag(self, tag) 1173 if tag.lower() in self.strip_tags: 1174 self.collect = True 1175 1176 def get_stripped_data(self): 1177 return ''.join(self.stripped_data) 1178 1179 1180class FileList(object): 1181 1182 """Map all lines from a textfile into a list. If the file is changed, the list is refreshed automatically 1183 Each line can be run through a callback filter which can change or remove the content. 1184 1185 filename: The textfile which should be mapped to a list. This can be changed at runtime. If None, an empty list will be returned. 1186 strip: remove leading/trailing whitespace from each line. 
Note that the newline character is always stripped 1187 skip_empty: skip empty lines (if used in combination with strip: skip all lines with only whitespace) 1188 skip_comments: skip lines starting with # 1189 lowercase: lowercase each line 1190 additional_filters: function or list of functions which will be called for each line on reload. 1191 Each function accept a single argument and must return a (possibly modified) line or None to skip this line 1192 minimum_time_between_reloads: number of seconds to cache the list before it will be reloaded if the file changes 1193 """ 1194 1195 def __init__(self, filename=None, strip=True, skip_empty=True, skip_comments=True, lowercase=False, additional_filters=None, minimum_time_between_reloads=5): 1196 self.filename = filename 1197 self.minium_time_between_reloads = minimum_time_between_reloads 1198 self._lastreload = 0 1199 self.linefilters = [] 1200 self.content = [] 1201 self.logger = logging.getLogger('%s.filelist' % __package__) 1202 self.lock = threading.Lock() 1203 1204 # we always strip newline 1205 self.linefilters.append(lambda x: x.rstrip('\r\n')) 1206 1207 if strip: 1208 self.linefilters.append(lambda x: x.strip()) 1209 1210 if skip_empty: 1211 self.linefilters.append(lambda x: x if x != '' else None) 1212 1213 if skip_comments: 1214 self.linefilters.append( 1215 lambda x: None if x.strip().startswith('#') else x) 1216 1217 if lowercase: 1218 self.linefilters.append(lambda x: x.lower()) 1219 1220 if additional_filters is not None: 1221 if isinstance(additional_filters, list): 1222 self.linefilters.extend(additional_filters) 1223 else: 1224 self.linefilters.append(additional_filters) 1225 1226 if filename is not None: 1227 self._reload_if_necessary() 1228 1229 def _reload_if_necessary(self): 1230 """Calls _reload if the file has been changed since the last reload""" 1231 now = time.time() 1232 # check if reloadinterval has passed 1233 if now - self._lastreload < self.minium_time_between_reloads: 1234 return 1235 
if not self.file_changed(): 1236 return 1237 if not self.lock.acquire(): 1238 return 1239 try: 1240 self._reload() 1241 finally: 1242 self.lock.release() 1243 1244 def _reload(self): 1245 """Reload the file and build the list""" 1246 self.logger.info('Reloading file %s' % self.filename) 1247 statinfo = os.stat(self.filename) 1248 ctime = statinfo.st_ctime 1249 self._lastreload = ctime 1250 with open(self.filename, 'r') as fp: 1251 lines = fp.readlines() 1252 newcontent = [] 1253 1254 for line in lines: 1255 for func in self.linefilters: 1256 line = func(line) 1257 if line is None: 1258 break 1259 1260 if line is not None: 1261 newcontent.append(line) 1262 1263 self.content = newcontent 1264 1265 def file_changed(self): 1266 """Return True if the file has changed on disks since the last reload""" 1267 if not os.path.isfile(self.filename): 1268 return False 1269 statinfo = os.stat(self.filename) 1270 ctime = statinfo.st_ctime 1271 if ctime > self._lastreload: 1272 return True 1273 return False 1274 1275 def get_list(self): 1276 """Returns the current list. If the file has been changed since the last call, it will rebuild the list automatically.""" 1277 self._reload_if_necessary() 1278 return self.content 1279 1280 1281 1282class Cache(object): 1283 """ 1284 Simple local cache object. 
1285 cached data will expire after a defined interval 1286 """ 1287 1288 def __init__(self, cachetime=30, cleanupinterval=300): 1289 self.cache={} 1290 self.cachetime=cachetime 1291 self.cleanupinterval=cleanupinterval 1292 self.lock=threading.Lock() 1293 self.logger=logging.getLogger("%s.settingscache" % __package__) 1294 1295 t = threading.Thread(target=self.clear_cache_thread) 1296 t.daemon = True 1297 t.start() 1298 1299 1300 def put_cache(self,key,obj): 1301 try: 1302 gotlock=self.lock.acquire(True) 1303 if gotlock: 1304 self.cache[key]=(obj,time.time()) 1305 except Exception as e: 1306 self.logger.exception(e) 1307 finally: 1308 if gotlock: 1309 self.lock.release() 1310 1311 1312 def get_cache(self,key): 1313 try: 1314 gotlock=self.lock.acquire(True) 1315 if not gotlock: 1316 return None 1317 1318 ret=None 1319 1320 if key in self.cache: 1321 obj,instime=self.cache[key] 1322 now=time.time() 1323 if now-instime<self.cachetime: 1324 ret=obj 1325 else: 1326 del self.cache[key] 1327 1328 except Exception as e: 1329 self.logger.exception(e) 1330 finally: 1331 self.lock.release() 1332 return ret 1333 1334 1335 def clear_cache_thread(self): 1336 while True: 1337 time.sleep(self.cleanupinterval) 1338 now=time.time() 1339 try: 1340 gotlock=self.lock.acquire(True) 1341 if not gotlock: 1342 continue 1343 1344 cleancount=0 1345 1346 for key in set(self.cache.keys()): 1347 obj,instime=self.cache[key] 1348 if now-instime>self.cachetime: 1349 del self.cache[key] 1350 cleancount+=1 1351 except Exception as e: 1352 self.logger.exception(e) 1353 finally: 1354 if gotlock: 1355 self.lock.release() 1356 self.logger.debug("Cleaned %s expired entries."%cleancount) 1357 1358 1359 1360class CacheSingleton(object): 1361 """ 1362 Process singleton to store a default Cache instance 1363 Note it is important there is a separate Cache instance for each process 1364 since otherwise the Threading.Lock will screw up and block the execution. 
1365 """ 1366 1367 instance = None 1368 procPID = None 1369 1370 def __init__(self, *args, **kwargs): 1371 pid = os.getpid() 1372 logger = logging.getLogger("%s.CacheSingleton" % __package__) 1373 if pid == CacheSingleton.procPID and CacheSingleton.instance is not None: 1374 logger.debug("Return existing Cache Singleton for process with pid: %u"%pid) 1375 else: 1376 if CacheSingleton.instance is None: 1377 logger.info("Create CacheSingleton for process with pid: %u"%pid) 1378 elif CacheSingleton.procPID != pid: 1379 logger.warning("Replace CacheSingleton(created by process %u) for process with pid: %u"%(CacheSingleton.procPID,pid)) 1380 1381 CacheSingleton.instance = Cache(*args,**kwargs) 1382 CacheSingleton.procPID = pid 1383 1384 def __getattr__(self, name): 1385 return getattr(self.instance, name) 1386 1387 1388def get_default_cache(): 1389 """ 1390 Function to get processor unique Cache Singleton 1391 """ 1392 return CacheSingleton() 1393