# $Log$
# Revision 1.8 2011/11/05 15:51:03 customdesigned
# New example
#
# Revision 1.7 2009/06/13 21:15:12 customdesigned
# Doxygen updates.
#
# Revision 1.6 2009/06/09 03:13:13 customdesigned
# More doxygen docs.
#
# Revision 1.5 2005/07/20 14:49:43 customdesigned
# Handle corrupt and empty ZIP files.
#
# Revision 1.4 2005/06/17 01:49:39 customdesigned
# Handle zip within zip.
#
# Revision 1.3 2005/06/02 15:00:17 customdesigned
# Configure banned extensions. Scan zipfile option with test case.
#
# Revision 1.2 2005/06/02 04:18:55 customdesigned
# Update copyright notices after reading article on /.
#
# Revision 1.1.1.4 2005/05/31 18:23:49 customdesigned
# Development changes since 0.7.2
#
# Revision 1.62 2005/02/14 22:31:17 stuart
# _parseparam replacement not needed for python2.4
#
# Revision 1.61 2005/02/12 02:11:11 stuart
# Pass unit tests with python2.4.
#
# Revision 1.60 2005/02/11 18:34:14 stuart
# Handle garbage after quote in boundary.
#
# Revision 1.59 2005/02/10 01:10:59 stuart
# Fixed MimeMessage.ismodified()
#
# Revision 1.58 2005/02/10 00:56:49 stuart
# Runs with python2.4. Defang not working correctly - more work needed.
#
# Revision 1.57 2004/11/20 16:37:52 stuart
# fix regex for splitting header and body
#
# Revision 1.56 2004/11/09 20:33:51 stuart
# Recognize more dynamic PTR variations.
#
# Revision 1.55 2004/10/06 21:39:20 stuart
# Handle message attachments with boundary errors by not parsing them
# until needed.
#
# Revision 1.54 2004/08/18 01:59:46 stuart
# Handle mislabeled multipart messages
#
# Revision 1.53 2004/04/24 22:53:20 stuart
# Rename some local variables to avoid shadowing builtins
#
# Revision 1.52 2004/04/24 22:47:13 stuart
# Convert header values to str
#
# Revision 1.51 2004/03/25 03:19:10 stuart
# Correctly defang rfc822 attachments when boundary specified with
# content-type message/rfc822.
#
# Revision 1.50 2003/10/15 22:01:00 stuart
# Test for and work around email bug with encoded filenames.
#
# Revision 1.49 2003/09/04 18:48:13 stuart
# Support python-2.2.3
#
# Revision 1.48 2003/09/02 00:27:27 stuart
# Should have full milter based dspam support working
#
# Revision 1.47 2003/08/26 06:08:18 stuart
# Use new python boolean since we now require 2.2.2
#
# Revision 1.46 2003/08/26 05:01:38 stuart
# Release 0.6.0
#
# Revision 1.45 2003/08/26 04:01:24 stuart
# Use new email module for parsing mail. Still need mime module to
# provide various bug fixes to email module, and maintain some compatibility
# with old milter code.
#

## @package mime
# This module provides a "defang" function to replace naughty attachments.
#
# We also provide workarounds for bugs in the email module that comes
# with python. The "bugs" fixed mostly come up only with malformed
# messages - but that is what you have when dealing with spam.

# Author: Stuart D. Gathman <stuart@bmsi.com>
# Copyright 2001,2002,2003,2004,2005 Business Management Systems, Inc.
# This code is under the GNU General Public License. See COPYING for details.

from __future__ import print_function
try:
    from io import BytesIO, StringIO
except:
    from StringIO import StringIO
    BytesIO = StringIO
import socket
import Milter
import zipfile
import sys

import email
from email import encoders
from email.message import Message
try:
    from email.generator import BytesGenerator
    from email import message_from_binary_file
    from email import message_from_bytes
except:
    from email.generator import Generator as BytesGenerator
    from email import message_from_file as message_from_binary_file
    from email import message_from_string as message_from_bytes
from email.utils import quote
from email.utils import collapse_rfc2231_value

if not getattr(Message,'as_bytes',None):
    Message.as_bytes = Message.as_string

## Return a list of filenames in a zip file.
# Embedded zip files are recursively expanded.
def zipnames(txt):
    """Return a list of ('zipname', member_name) pairs for a ZIP archive.

    txt is the raw content of the archive.  Every member contributes one
    pair; a member whose name ends in '.zip' is additionally read and
    expanded recursively, its entries following the member's own entry.
    Errors raised by zipfile.ZipFile (e.g. zipfile.BadZipfile for data that
    is not a ZIP archive) propagate to the caller.
    """
    zipf = zipfile.ZipFile(BytesIO(txt),'r')
    try:
        names = []
        for nm in zipf.namelist():
            names.append(('zipname',nm))
            if nm.lower().endswith('.zip'):
                # NOTE(review): nested archives are inflated in memory with
                # no depth or size limit - consider bounding for hostile input.
                names += zipnames(zipf.read(nm))
        return names
    finally:
        zipf.close()

## Fix multipart handling in email.Generator.
#
class MimeGenerator(BytesGenerator):
    """Generator that writes any message holding a list of parts as multipart."""

    def _dispatch(self, msg):
        """Write msg, forcing the multipart handler when it has sub parts.

        A message whose payload is a list of parts but whose Content-Type
        main type is not 'multipart' is handed to _handle_multipart();
        everything else uses the normal BytesGenerator dispatch.
        """
        main = msg.get_content_maintype()
        if msg.is_multipart() and main.lower() != 'multipart':
            self._handle_multipart(msg)
        else:
            BytesGenerator._dispatch(self,msg)

def unquote(s):
    """Remove quotes from a string.

    A value starting with a double quote has the quotes stripped and the
    backslash escapes for backslash and double quote undone.  If the value
    does not end with a quote, everything from the next quote onward is
    discarded as garbage; if there is no other quote the value is returned
    unchanged.  A value enclosed in <angle brackets> has the brackets removed.
    """
    if len(s) > 1:
        if s.startswith('"'):
            if s.endswith('"'):
                s = s[1:-1]
            else:
                # remove garbage after trailing quote
                end = s.find('"',1)
                if end < 0:
                    return s
                s = s[1:end]
            return s.replace('\\\\', '\\').replace('\\"', '"')
        if s.startswith('<') and s.endswith('>'):
            return s[1:-1]
    return s

def _unquotevalue(value):
    """Apply unquote() to a parameter value.

    A tuple is treated as a (charset, language, value) triple and only its
    third element is unquoted; anything else is unquoted directly.
    """
    if isinstance(value, tuple):
        return value[0], value[1], unquote(value[2])
    else:
        return unquote(value)

#email.Message._unquotevalue = _unquotevalue

from email.message import _parseparam

## Enhance email.message.Message
#
# Tracks modifications to headers of body or any part independently.

class MimeMessage(Message):
    """Version of email.Message.Message compatible with old mime module
    """
    def __init__(self,fp=None,seekable=1):
        """Create an empty, unmodified message.

        fp and seekable are accepted but not used by this constructor.
        """
        Message.__init__(self)
        # Lazily parsed sub-message, see get_submsg().
        self.submsg = None
        # Set whenever a header or the payload of this part is changed.
        self.modified = False
        ## @var headerchange
        # Provide a headerchange event for integration with Milter.
        # The headerchange attribute can be assigned a function to be called when
        # changing headers.  The signature is:
        # headerchange(msg,name,value) -> None
        self.headerchange = None

    def get_param(self, param, failobj=None, header='content-type', unquote=True):
        """Like Message.get_param, but unquote 'boundary' one extra time."""
        val = Message.get_param(self,param,failobj,header,unquote)
        if val != failobj and param == 'boundary' and unquote:
            # unquote boundaries an extra time, test case testDefang5
            return _unquotevalue(val)
        return val

    # Aliases using the method names of the old mime module.
    getfilename = Message.get_filename
    ismultipart = Message.is_multipart
    getheaders = Message.get_all
    gettype = Message.get_content_type
    # NOTE(review): bound to Message.get_param, so this alias bypasses the
    # extra boundary unquoting of get_param() above - confirm that is intended.
    getparam = Message.get_param

    def getparams(self):
        """Return the Content-Type parameters, or [] if there are none."""
        return self.get_params([])

    def getname(self):
        """Return the 'name' parameter of the Content-Type header."""
        return self.get_param('name')

    def getnames(self,scan_zip=False):
        """Return a list of (attr,name) pairs of attributes that IE might
        interpret as a name - and hence decide to execute this message.

        The list holds every Content-Type parameter followed by
        ("filename", self.get_filename()).  With scan_zip true, the payload
        is additionally passed to zipnames() for each collected name ending
        in '.zip', appending its ('zipname', member) pairs; errors from
        zipnames() propagate.
        """
        names = []
        for attr,val in self._get_params_preserve([],'content-type'):
            if isinstance(val, tuple):
                # It's an RFC 2231 encoded parameter.  collapse_rfc2231_value
                # decodes it to text on both Python 2 and 3 (the builtin
                # unicode() used previously does not exist on Python 3).
                val = collapse_rfc2231_value(_unquotevalue(val))
            else:
                val = _unquotevalue(val.strip())
            names.append((attr,val))
        names.append(("filename",self.get_filename()))
        if scan_zip:
            for key,name in tuple(names):   # copy by converting to tuple
                if name and name.lower().endswith('.zip'):
                    txt = self.get_payload(decode=True)
                    # the decoded payload is None for multipart messages
                    if txt and txt.strip():
                        names += zipnames(txt)
        return names

    def ismodified(self):
        "True if this message or a subpart has been modified."
        if not self.is_multipart():
            if isinstance(self.submsg,Message):
                return self.submsg.ismodified()
            return self.modified
        if self.modified: return True
        for i in self.get_payload():
            if i.ismodified(): return True
        return False

    def dump(self,file,unixfrom=False):
        "Write this message (and all subparts) to a file"
        g = MimeGenerator(file)
        g.flatten(self,unixfrom=unixfrom)

    def as_bytes(self, unixfrom=False):
        "Return the entire formatted message as written by dump()."
        fp = BytesIO()
        self.dump(fp,unixfrom=unixfrom)
        return fp.getvalue()

    def getencoding(self):
        "Return the Content-Transfer-Encoding header value, or None."
        return self.get('content-transfer-encoding',None)

    # Decode body to stream according to transfer encoding, return encoding name
    def decode(self,filt):
        """Write the decoded payload to filt and return the encoding name.

        Decoding is best effort: any error while decoding or writing is
        ignored and the encoding name is still returned.
        """
        try:
            filt.write(self.get_payload(decode=True))
        except Exception:
            pass
        return self.getencoding()

    def get_payload_decoded(self):
        "Return the payload decoded according to its transfer encoding."
        return self.get_payload(decode=True)

    def __setitem__(self, name, value):
        "Add a header, mark the message modified and fire headerchange."
        rc = Message.__setitem__(self,name,value)
        self.modified = True
        if self.headerchange: self.headerchange(self,name,str(value))
        return rc

    def __delitem__(self, name):
        "Fire headerchange with value None, delete the header, mark modified."
        if self.headerchange: self.headerchange(self,name,None)
        rc = Message.__delitem__(self,name)
        self.modified = True
        return rc

    def get_payload(self,i=None,decode=False):
        """Return the payload, first folding in a modified sub-message.

        If get_submsg() produced a sub-message that has since been modified,
        it replaces the payload before the normal lookup is done.
        """
        msg = self.submsg
        if isinstance(msg,Message) and msg.ismodified():
            self.set_payload([msg])
        return Message.get_payload(self,i,decode)

    def set_payload(self, val, charset=None):
        """Set the payload, mark the message modified, drop the sub-message.

        val may be a file-like object, in which case it is rewound and its
        whole content is used as the payload.
        """
        self.modified = True
        try:
            val.seek(0)
            val = val.read()
        except Exception:
            # not a seekable stream: use val itself as the payload
            pass
        Message.set_payload(self,val,charset)
        self.submsg = None

    def get_submsg(self):
        """Return the message embedded in this part, or None.

        Only message/rfc822 and multipart/* parts have a sub-message.  A
        payload that is still unparsed text is decoded and parsed on first
        use, with every parsed part marked unmodified; an already parsed
        payload yields its first part.  The result is cached in self.submsg.
        """
        t = self.get_content_type().lower()
        if t == 'message/rfc822' or t.startswith('multipart/'):
            # Compare with None: a Message with no headers has length 0 and
            # is therefore falsy, which would cause it to be parsed again.
            if self.submsg is None:
                txt = self.get_payload()
                if isinstance(txt,str):
                    # The decoded payload is bytes on Python 3, so it must go
                    # through the bytes parser (message_from_string on Python 2).
                    txt = self.get_payload(decode=True)
                    self.submsg = message_from_bytes(txt,MimeMessage)
                    for part in self.submsg.walk():
                        part.modified = False
                elif txt:
                    self.submsg = txt[0]
            return self.submsg
        return None

def message_from_file(fp):
    """Parse a binary file into a MimeMessage with all parts unmodified."""
    msg = message_from_binary_file(fp,MimeMessage)
    for part in msg.walk():
        part.modified = False
    assert not msg.ismodified()
    return msg

extlist = ''.join("""
ade,adp,asd,asx,asp,bas,bat,chm,cmd,com,cpl,crt,dll,exe,hlp,hta,inf,ins,isp,js,
jse,lnk,mdb,mde,msc,msi,msp,mst,ocx,pcd,pif,reg,scr,sct,shs,url,vb,vbe,vbs,wsc,
wsf,wsh
""".split())
bad_extensions = ['.' + x for x in extlist.split(',')]

def check_ext(name):
    """Check a name for dangerous Winblows extensions.

    Returns name when it ends (case-insensitively) with one of
    bad_extensions, None when it does not, and name itself when it is
    empty or None.
    """
    if not name: return name
    lname = name.lower()
    for ext in bad_extensions:
        if lname.endswith(ext): return name
    return None

virus_msg = """This message appeared to contain a virus.
It was originally named '%s', and has been removed.
A copy of your original message was saved as '%s:%s'.
See your administrator.
"""

def _replace_with_warning(msg,text):
    "Replace the content of msg with the plain text attachment WARNING.TXT."
    msg.set_payload(text)
    del msg["content-type"]
    del msg["content-disposition"]
    del msg["content-transfer-encoding"]
    name = "WARNING.TXT"
    msg["Content-Type"] = "text/plain; name="+name

def check_name(msg,savname=None,ckname=check_ext,scan_zip=False):
    """Replace attachment with a warning if its name is suspicious.

    msg       message part providing getnames() and get_filename()
    savname   name of the saved original, quoted in the warning text
    ckname    function(name) returning a true value for a bad name
    scan_zip  passed to msg.getnames() to also check names inside ZIP files

    Always returns Milter.CONTINUE.  A payload that fails to open as a ZIP
    archive (zipfile.BadZipfile) is treated like a bad name.
    """
    try:
        for key,name in msg.getnames(scan_zip):
            badname = ckname(name)
            if badname:
                if key == 'zipname':
                    # report the archive itself, not the member inside it
                    badname = msg.get_filename()
                break
        else:
            return Milter.CONTINUE
    except zipfile.BadZipfile:
        # a ZIP that is not a zip is very suspicious
        badname = msg.get_filename()
    hostname = socket.gethostname()
    _replace_with_warning(msg,virus_msg % (badname,hostname,savname))
    return Milter.CONTINUE

def check_attachments(msg,check):
    """Scan attachments.

    msg    MimeMessage
    check  function(MimeMessage): int
           Return CONTINUE, REJECT, ACCEPT

    check is applied to every non-multipart part, depth first.  The first
    result other than Milter.CONTINUE stops the scan and is returned.
    """
    if msg.is_multipart():
        for i in msg.get_payload():
            rc = check_attachments(i,check)
            if rc != Milter.CONTINUE: return rc
        return Milter.CONTINUE
    return check(msg)

# save call context for Python without nested_scopes
class _defang:
    """Callable that replaces all attachments having dangerous names.

    NOTE(review): the arguments of each call are stored on the instance, so
    one instance (such as the module level defang) must not be called
    concurrently with different arguments - confirm callers serialize this.
    """

    def __init__(self,scan_html=True):
        # When true, scripts are also removed from HTML parts.
        self.scan_html = scan_html

    def _chk_name(self,msg):
        "Check one part, then any message embedded in that part."
        rc = check_name(msg,self._savname,self._check,self.scan_zip)
        if self.scan_html:
            check_html(msg,self._savname)   # remove scripts from HTML
        if self.scan_rfc822:
            msg = msg.get_submsg()
            if isinstance(msg,Message):
                return check_attachments(msg,self._chk_name)
        return rc

    def __call__(self,msg,savname=None,check=check_ext,scan_rfc822=True,
                 scan_zip=False):
        """Compatible entry point.
        Replace all attachments with dangerous names.

        Returns True if msg or one of its parts was modified, else False.
        """
        self._savname = savname
        self._check = check
        self.scan_rfc822 = scan_rfc822
        self.scan_zip = scan_zip
        check_attachments(msg,self._chk_name)
        if msg.ismodified():
            return True
        return False

# emulate old defang function
defang = _defang()

# Compare the numeric version, not the version string.
if sys.version_info[0] < 3:
    from sgmllib import SGMLParser as HTMLParser
else:
    from Milter.sgmllib import SGMLParser as HTMLParser

import re
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')

class SGMLFilter(HTMLParser):
    """Parse HTML and pass through all constructs unchanged.  It is intended for
    derived classes to implement exceptional processing for selected cases.
    """
    def __init__(self,out):
        "out is the object whose write() method receives the output."
        HTMLParser.__init__(self)
        self.out = out

    def handle_comment(self,comment):
        self.out.write("<!--%s-->" % comment)

    def unknown_starttag(self,tag,attr):
        "Write a start tag, using the original source text when available."
        if hasattr(self,"get_starttag_text"):
            self.out.write(self.get_starttag_text())
        else:
            self.out.write("<%s" % tag)
            for (key,val) in attr:
                self.out.write(' %s="%s"' % (key,val))
            self.out.write('>')

    def handle_data(self,data):
        self.out.write(data)

    def handle_entityref(self,ref):
        self.out.write("&%s;" % ref)

    def handle_charref(self,ref):
        self.out.write("&#%s;" % ref)

    def unknown_endtag(self,tag):
        self.out.write("</%s>" % tag)

    def handle_special(self,data):
        self.out.write("<!%s>" % data)

    def write(self,buf):
        "Act like a writer.  Why doesn't HTMLParser do this by default?"
        self.feed(buf)

    # Python-2.1 sgmllib rejects illegal declarations.  Since various Microsoft
    # products accept and output them, we need to pass them through -
    # at least until we discover that MS will execute them.
    # sgmlop-1.1 will not use this method, but calls handle_special to
    # do what we want.
    def parse_declaration(self, i):
        """Pass the declaration starting at rawdata[i] to handle_special.

        Returns the index just past the closing '>', or -1 when the buffer
        ends before the declaration is complete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i + 2
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_special(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    # incomplete or an error?
                    return -1
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    # incomplete or an error?
                    return -1
                j = m.end()
            else:
                j += 1
        # end of buffer between tokens
        return -1

class HTMLScriptFilter(SGMLFilter):
    """Remove scripts from an HTML document.

    Everything between a script start tag and its end tag is dropped and
    replaced by the warning comment in self.msg.  self.modified is set once
    a script has been seen.
    """
    def __init__(self,out):
        SGMLFilter.__init__(self,out)
        # Nesting depth of the script elements currently open.
        self.ignoring = 0
        self.modified = False
        self.msg = "<!-- WARNING: embedded script removed -->"

    def start_script(self,unused):
        self.ignoring += 1
        self.modified = True

    def end_script(self):
        # Guard against an unbalanced end tag so the depth never goes
        # negative, which would leave the filter stuck in "ignoring" state.
        if self.ignoring > 0:
            self.ignoring -= 1
            if not self.ignoring:
                self.out.write(self.msg)

    # The parser keeps recognizing markup inside a script element, so every
    # output method has to be silenced while a script is open - not just
    # data and comments - or fragments of the script leak into the output.
    def handle_data(self,data):
        if not self.ignoring: SGMLFilter.handle_data(self,data)

    def handle_comment(self,comment):
        if not self.ignoring: SGMLFilter.handle_comment(self,comment)

    def unknown_starttag(self,tag,attr):
        if not self.ignoring: SGMLFilter.unknown_starttag(self,tag,attr)

    def unknown_endtag(self,tag):
        if not self.ignoring: SGMLFilter.unknown_endtag(self,tag)

    def handle_entityref(self,ref):
        if not self.ignoring: SGMLFilter.handle_entityref(self,ref)

    def handle_charref(self,ref):
        if not self.ignoring: SGMLFilter.handle_charref(self,ref)

    def handle_special(self,data):
        if not self.ignoring: SGMLFilter.handle_special(self,data)

def _save_debug_copy(msg,path='debug.out'):
    "Best effort: save the undecoded payload of msg to path for debugging."
    try:
        data = msg.get_payload()
        if not isinstance(data,bytes):
            data = str(data).encode('utf-8','backslashreplace')
        with open(path,'wb') as fp:
            fp.write(data)
    except Exception:
        # the debug copy must never keep the attachment from being replaced
        pass

def check_html(msg,savname=None):
    """Remove scripts from HTML attachments.

    msg      message part to check; text/html parts, and
             application/octet-stream parts having a name ending in '.htm',
             are filtered through HTMLScriptFilter
    savname  name of the saved original, quoted in the warning text

    If scripts were removed the payload is replaced by the filtered HTML,
    encoded quoted-printable.  If the HTML cannot be decoded or parsed the
    part is replaced by a plain text warning.  Always returns
    Milter.CONTINUE.
    """
    msgtype = msg.get_content_type().lower()
    # check for more MSIE braindamage
    if msgtype == 'application/octet-stream':
        for (attr,name) in msg.getnames():
            if name and name.lower().endswith(".htm"):
                msgtype = 'text/html'
    if msgtype == 'text/html':
        out = StringIO()
        htmlfilter = HTMLScriptFilter(out)
        try:
            htmlfilter.write(msg.get_payload(decode=True).decode())
            htmlfilter.close()
        #except sgmllib.SGMLParseError:
        except Exception:
            # The mimetools module used here previously was never imported,
            # so this path raised NameError instead of replacing the part.
            _save_debug_copy(msg)
            try:
                htmlfilter.close()
            except Exception:
                # closing parses the rest of the buffer and may fail again
                pass
            hostname = socket.gethostname()
            _replace_with_warning(msg,
                "An HTML attachment could not be parsed. The original is saved as '%s:%s'"
                % (hostname,savname))
            return Milter.CONTINUE
        if htmlfilter.modified:
            msg.set_payload(out)    # remove embedded scripts
            del msg["content-transfer-encoding"]
            # email.Encoders is gone in Python 3; email.encoders works on both
            encoders.encode_quopri(msg)
    return Milter.CONTINUE

if __name__ == '__main__':
    import email.iterators

    def _list_attach(msg):
        "Print filename, content type and payload type of each attachment."
        p = msg.get_payload(decode=True)
        print(msg.get_filename(),msg.get_content_type(),type(p))
        msg = msg.get_submsg()
        if isinstance(msg,Message):
            return check_attachments(msg,_list_attach)
        return Milter.CONTINUE

    for fname in sys.argv[1:]:
        with open(fname,'rb') as fp:
            msg = message_from_file(fp)
        email.iterators._structure(msg)
        check_attachments(msg,_list_attach)