1# $Log$
2# Revision 1.8  2011/11/05 15:51:03  customdesigned
3# New example
4#
5# Revision 1.7  2009/06/13 21:15:12  customdesigned
6# Doxygen updates.
7#
8# Revision 1.6  2009/06/09 03:13:13  customdesigned
9# More doxygen docs.
10#
11# Revision 1.5  2005/07/20 14:49:43  customdesigned
12# Handle corrupt and empty ZIP files.
13#
14# Revision 1.4  2005/06/17 01:49:39  customdesigned
15# Handle zip within zip.
16#
17# Revision 1.3  2005/06/02 15:00:17  customdesigned
18# Configure banned extensions.  Scan zipfile option with test case.
19#
20# Revision 1.2  2005/06/02 04:18:55  customdesigned
21# Update copyright notices after reading article on /.
22#
23# Revision 1.1.1.4  2005/05/31 18:23:49  customdesigned
24# Development changes since 0.7.2
25#
26# Revision 1.62  2005/02/14 22:31:17  stuart
27# _parseparam replacement not needed for python2.4
28#
29# Revision 1.61  2005/02/12 02:11:11  stuart
30# Pass unit tests with python2.4.
31#
32# Revision 1.60  2005/02/11 18:34:14  stuart
33# Handle garbage after quote in boundary.
34#
35# Revision 1.59  2005/02/10 01:10:59  stuart
36# Fixed MimeMessage.ismodified()
37#
38# Revision 1.58  2005/02/10 00:56:49  stuart
39# Runs with python2.4.  Defang not working correctly - more work needed.
40#
41# Revision 1.57  2004/11/20 16:37:52  stuart
42# fix regex for splitting header and body
43#
44# Revision 1.56  2004/11/09 20:33:51  stuart
45# Recognize more dynamic PTR variations.
46#
47# Revision 1.55  2004/10/06 21:39:20  stuart
48# Handle message attachments with boundary errors by not parsing them
49# until needed.
50#
51# Revision 1.54  2004/08/18 01:59:46  stuart
52# Handle mislabeled multipart messages
53#
54# Revision 1.53  2004/04/24 22:53:20  stuart
55# Rename some local variables to avoid shadowing builtins
56#
57# Revision 1.52  2004/04/24 22:47:13  stuart
58# Convert header values to str
59#
60# Revision 1.51  2004/03/25 03:19:10  stuart
61# Correctly defang rfc822 attachments when boundary specified with
62# content-type message/rfc822.
63#
64# Revision 1.50  2003/10/15 22:01:00  stuart
65# Test for and work around email bug with encoded filenames.
66#
67# Revision 1.49  2003/09/04 18:48:13  stuart
68# Support python-2.2.3
69#
70# Revision 1.48  2003/09/02 00:27:27  stuart
71# Should have full milter based dspam support working
72#
73# Revision 1.47  2003/08/26 06:08:18  stuart
74# Use new python boolean since we now require 2.2.2
75#
76# Revision 1.46  2003/08/26 05:01:38  stuart
77# Release 0.6.0
78#
79# Revision 1.45  2003/08/26 04:01:24  stuart
80# Use new email module for parsing mail.  Still need mime module to
81# provide various bug fixes to email module, and maintain some compatibility
82# with old milter code.
83#
84
85## @package mime
86# This module provides a "defang" function to replace naughty attachments.
87#
88# We also provide workarounds for bugs in the email module that comes
89# with python.  The "bugs" fixed mostly come up only with malformed
90# messages - but that is what you have when dealing with spam.
91
92# Author: Stuart D. Gathman <stuart@bmsi.com>
93# Copyright 2001,2002,2003,2004,2005 Business Management Systems, Inc.
94# This code is under the GNU General Public License.  See COPYING for details.
95
96from __future__ import print_function
97try:
98  from io import BytesIO, StringIO
99except:
100  from StringIO import StringIO
101  BytesIO = StringIO
102import socket
103import Milter
104import zipfile
105import sys
106
107import email
108from email.message import Message
109try:
110  from email.generator import BytesGenerator
111  from email import message_from_binary_file
112except:
113  from email.generator import Generator as BytesGenerator
114  from email import message_from_file as message_from_binary_file
115from email.utils import quote
116
# Older email packages have no Message.as_bytes(); alias it to as_string()
# so that callers can use as_bytes() unconditionally.
if not hasattr(Message, 'as_bytes'):
  setattr(Message, 'as_bytes', Message.as_string)
119
## Return a list of filenames in a zip file.
# Embedded zip files are recursively expanded.
# @param txt the raw bytes of a zip archive
# @return a list of ('zipname', member name) pairs, with the members of any
#   nested archive listed right after the nested archive's own name
def zipnames(txt):
  """Return ('zipname', name) pairs for every member of the zip data `txt`.

  Members whose name ends in '.zip' (case insensitive) are read and scanned
  recursively.  Errors raised by zipfile (for example zipfile.BadZipfile
  for data that is not a zip archive) are propagated to the caller.
  """
  zipf = zipfile.ZipFile(BytesIO(txt),'r')
  try:
    names = []
    for nm in zipf.namelist():
      names.append(('zipname',nm))
      if nm.lower().endswith('.zip'):
        names += zipnames(zipf.read(nm))
    return names
  finally:
    # try/finally rather than "with": works on every supported Python and
    # releases the archive even when a nested member is corrupt
    zipf.close()
131
## Fix multipart handling in email.Generator.
#
# A message that holds a list of parts is always written with the multipart
# handler, even when its Content-Type main type is not "multipart".
class MimeGenerator(BytesGenerator):
    def _dispatch(self, msg):
        """Write the body of msg using the handler matching its structure.

        The stock generator picks the handler from the Content-Type alone.
        Here a message for which is_multipart() is true goes to
        _handle_multipart() regardless of its declared main type; every
        other message is dispatched by the base class.
        """
        maintype = msg.get_content_maintype().lower()
        if maintype == 'multipart' or not msg.is_multipart():
            # correctly labeled, or not a container at all: default dispatch
            BytesGenerator._dispatch(self, msg)
            return
        # mislabeled container
        self._handle_multipart(msg)
145
def unquote(s):
    """Remove quotes from a string.

    A value starting with a double quote has the quotes removed and the
    backslash escapes for backslash and double quote undone.  When the
    value does not also end with a double quote, everything after the next
    double quote is treated as garbage and dropped; when there is no
    closing quote at all the value is returned unchanged.

    A value enclosed in angle brackets has the brackets removed.  Any other
    value, and any value shorter than two characters, is returned as is.
    """
    if len(s) < 2:
        return s
    if s.startswith('"'):
        if s.endswith('"'):
            inner = s[1:-1]
        else:
            # remove garbage after trailing quote
            close = s.find('"', 1)
            if close < 0:
                # no closing quote at all: leave the value untouched
                return s
            inner = s[1:close]
        return inner.replace('\\\\', '\\').replace('\\"', '"')
    if s.startswith('<') and s.endswith('>'):
        return s[1:-1]
    return s
160
def _unquotevalue(value):
  """Unquote a parameter value.

  A plain value is passed through unquote().  For a tuple, only the third
  element is unquoted; the first two elements are returned unchanged.
  """
  if not isinstance(value, tuple):
      return unquote(value)
  return value[0], value[1], unquote(value[2])
166
167#email.Message._unquotevalue = _unquotevalue
168
169from email.message import _parseparam
170
## Enhance email.message.Message
#
# Tracks modifications to the headers or body of each part independently.

class MimeMessage(Message):
  """Version of email.Message.Message compatible with old mime module.

  Each instance carries a `modified` flag that is set whenever a header is
  assigned or deleted or the payload is replaced, an optional
  `headerchange` callback, and `submsg`, a cache for the message returned
  by get_submsg().
  """
  def __init__(self,fp=None,seekable=1):
    """Create an empty, unmodified message.

    fp and seekable are accepted for compatibility with the old
    constructor signature; they are not used.
    """
    Message.__init__(self)
    self.submsg = None
    self.modified = False
  ## @var headerchange
  # Provide a headerchange event for integration with Milter.
  #   The headerchange attribute can be assigned a function to be called when
  #   changing headers.  The signature is:
  #   headerchange(msg,name,value) -> None
  #   value is None when the header is being deleted.
    self.headerchange = None

  def get_param(self, param, failobj=None, header='content-type', unquote=True):
    "Like Message.get_param, but unquote a boundary parameter once more."
    val = Message.get_param(self,param,failobj,header,unquote)
    if val != failobj and param == 'boundary' and unquote:
      # unquote boundaries an extra time, test case testDefang5
      return _unquotevalue(val)
    return val

  # Names used by the old mime module.
  getfilename = Message.get_filename
  ismultipart = Message.is_multipart
  getheaders = Message.get_all
  gettype = Message.get_content_type
  # NOTE(review): bound to the base class method, so getparam() skips the
  # extra boundary unquoting done by get_param() above - confirm intended.
  getparam = Message.get_param

  def getparams(self):
    "Return the Content-Type parameters, or an empty list if there are none."
    return self.get_params([])

  def getname(self):
    "Return the 'name' parameter of the Content-Type header, or None."
    return self.get_param('name')

  def getnames(self,scan_zip=False):
    """Return a list of (attr,name) pairs of attributes that IE might
       interpret as a name - and hence decide to execute this message.

    The list holds every Content-Type parameter followed by
    ("filename", self.get_filename()).  With scan_zip true, for each name
    ending in '.zip' the decoded payload is scanned with zipnames() and the
    resulting ('zipname', member) pairs are appended; errors raised by
    zipnames() propagate to the caller.
    """
    # Imported here so the method needs no new module level import.
    # collapse_rfc2231_value exists on Python 2 and 3, unlike the unicode()
    # builtin that was used here before.
    from email.utils import collapse_rfc2231_value
    names = []
    for attr,val in self._get_params_preserve([],'content-type'):
      if isinstance(val, tuple):
        # It's an RFC 2231 encoded parameter
        val = collapse_rfc2231_value(_unquotevalue(val))
      else:
        val = _unquotevalue(val.strip())
      names.append((attr,val))
    names.append(("filename",self.get_filename()))
    if scan_zip:
      for key,name in tuple(names):	# copy by converting to tuple
        if name and name.lower().endswith('.zip'):
          txt = self.get_payload(decode=True)
          # the payload can be None, skip scanning in that case
          if txt and txt.strip():
            names += zipnames(txt)
    return names

  def ismodified(self):
    "True if this message or a subpart has been modified."
    if not self.is_multipart():
      if isinstance(self.submsg,Message):
        # A change to this part's own headers counts as well, not only
        # changes inside the parsed sub-message.
        return self.modified or self.submsg.ismodified()
      return self.modified
    if self.modified: return True
    for i in self.get_payload():
      if i.ismodified(): return True
    return False

  def dump(self,file,unixfrom=False):
    "Write this message (and all subparts) to a file"
    g = MimeGenerator(file)
    g.flatten(self,unixfrom=unixfrom)

  def as_bytes(self, unixfrom=False):
      "Return the entire formatted message as written by dump()."
      fp = BytesIO()
      self.dump(fp,unixfrom=unixfrom)
      return fp.getvalue()

  def getencoding(self):
    "Return the Content-Transfer-Encoding header value, or None."
    return self.get('content-transfer-encoding',None)

  # Decode body to stream according to transfer encoding, return encoding name
  def decode(self,filt):
    """Write the decoded payload to filt and return the encoding name.

    Writing is best effort: a failure to decode or write is ignored and
    the encoding name is still returned.
    """
    try:
      filt.write(self.get_payload(decode=True))
    except Exception:
      # deliberate best effort, but let KeyboardInterrupt/SystemExit through
      pass
    return self.getencoding()

  def get_payload_decoded(self):
    "Return the payload decoded according to its transfer encoding."
    return self.get_payload(decode=True)

  def __setitem__(self, name, value):
    "Set a header, mark the message modified and fire headerchange."
    rc = Message.__setitem__(self,name,value)
    self.modified = True
    if self.headerchange: self.headerchange(self,name,str(value))
    return rc

  def __delitem__(self, name):
    "Fire headerchange with value None, delete the header, mark modified."
    if self.headerchange: self.headerchange(self,name,None)
    rc = Message.__delitem__(self,name)
    self.modified = True
    return rc

  def get_payload(self,i=None,decode=False):
    """Return the payload like Message.get_payload.

    If the message cached by get_submsg() has been modified, it first
    replaces the payload so that the changes are not lost.
    """
    msg = self.submsg
    if isinstance(msg,Message) and msg.ismodified():
      self.set_payload([msg])
    return Message.get_payload(self,i,decode)

  def set_payload(self, val, charset=None):
    """Replace the payload and mark the message modified.

    val may also be a file-like object, in which case it is rewound and
    its whole content is used as the payload.  The cached sub-message is
    discarded.
    """
    self.modified = True
    try:
      val.seek(0)
      val = val.read()
    except Exception:
      # not a seekable, readable file object: use val itself as the payload
      pass
    Message.set_payload(self,val,charset)
    self.submsg = None

  def get_submsg(self):
    """Return the message contained in this part, or None.

    Only message/rfc822 and multipart/* parts have a sub-message.  When the
    payload is already a list of messages the first one is returned.
    Otherwise the decoded payload is parsed into a MimeMessage, which
    starts out unmodified and is cached in self.submsg.
    """
    t = self.get_content_type().lower()
    if t == 'message/rfc822' or t.startswith('multipart/'):
      # "is None", not truthiness: a Message without headers has len() == 0
      # and would otherwise be parsed again on every call.
      if self.submsg is None:
        txt = self.get_payload()
        if isinstance(txt, list):
          if txt:
            self.submsg = txt[0]
        elif txt is not None:
          raw = self.get_payload(decode=True)
          # On Python 3 the decoded payload is bytes and needs the bytes
          # parser; Python 2 has no message_from_bytes and str is bytes.
          parse = email.message_from_string
          if isinstance(raw, bytes):
            parse = getattr(email,'message_from_bytes',parse)
          self.submsg = parse(raw,MimeMessage)
          for part in self.submsg.walk():
            part.modified = False
      return self.submsg
    return None
308
def message_from_file(fp):
  """Parse the file object fp into a MimeMessage tree.

  Every part of the result has its modified flag cleared, so the returned
  message reports ismodified() as false.
  """
  parsed = message_from_binary_file(fp, MimeMessage)
  for node in parsed.walk():
    node.modified = False
  assert not parsed.ismodified()
  return parsed
315
# Comma separated list of the file name extensions considered dangerous.
extlist = (
  "ade,adp,asd,asx,asp,bas,bat,chm,cmd,com,cpl,crt,dll,exe,hlp,hta,inf,ins,isp,js,"
  "jse,lnk,mdb,mde,msc,msi,msp,mst,ocx,pcd,pif,reg,scr,sct,shs,url,vb,vbe,vbs,wsc,"
  "wsf,wsh"
)
# The same extensions as a list, each with its leading dot.
bad_extensions = ['.%s' % ext for ext in extlist.split(',')]
322
def check_ext(name):
  """Check a name for dangerous Winblows extensions.

  Returns name when it ends, ignoring case, with one of bad_extensions and
  None when it does not.  An empty or None name is returned unchanged.
  """
  if not name:
    return name
  lowered = name.lower()
  if any(lowered.endswith(suffix) for suffix in bad_extensions):
    return name
  return None
330
# Text that replaces a removed attachment.  The three %s fields are filled
# by check_name with the offending name, the local host name and the name
# under which the original message was saved.
virus_msg = (
  "This message appeared to contain a virus.\n"
  "It was originally named '%s', and has been removed.\n"
  "A copy of your original message was saved as '%s:%s'.\n"
  "See your administrator.\n"
)
336
def check_name(msg,savname=None,ckname=check_ext,scan_zip=False):
  """Replace attachment with a warning if its name is suspicious.

  msg       the message part to examine
  savname   name the original message was saved under, quoted in the warning
  ckname    function(name) returning a true value for a suspicious name
  scan_zip  passed to msg.getnames() to also check names inside zip files

  When ckname flags one of the names, or the zip scan raises
  zipfile.BadZipfile, the payload is replaced by virus_msg and the part is
  relabeled as text/plain named WARNING.TXT.  Always returns
  Milter.CONTINUE.
  """
  suspicious = False
  try:
    for key,name in msg.getnames(scan_zip):
      badname = ckname(name)
      if not badname:
        continue
      if key == 'zipname':
        # report the archive, not the member found inside it
        badname = msg.get_filename()
      suspicious = True
      break
  except zipfile.BadZipfile:
    # a ZIP that is not a zip is very suspicious
    badname = msg.get_filename()
    suspicious = True
  if not suspicious:
    return Milter.CONTINUE
  hostname = socket.gethostname()
  msg.set_payload(virus_msg % (badname,hostname,savname))
  for header in ("content-type","content-disposition",
                 "content-transfer-encoding"):
    del msg[header]
  name = "WARNING.TXT"
  msg["Content-Type"] = "text/plain; name="+name
  return Milter.CONTINUE
359
def check_attachments(msg,check):
  """Scan attachments.

  msg    MimeMessage to scan
  check  function(MimeMessage): int
         Return CONTINUE, REJECT, ACCEPT

  check is applied to every part of msg that is not itself multipart.
  Scanning stops at the first result other than Milter.CONTINUE, which is
  returned; otherwise Milter.CONTINUE is returned.
  """
  if not msg.is_multipart():
    return check(msg)
  for part in msg.get_payload():
    verdict = check_attachments(part,check)
    if verdict != Milter.CONTINUE:
      return verdict
  return Milter.CONTINUE
372
# save call context for Python without nested_scopes
class _defang:
  """Callable that replaces attachments with dangerous names.

  The options of a call are kept on the instance so that _chk_name, which
  check_attachments invokes with the part alone, can reach them.
  NOTE(review): because of that, two overlapping calls on one instance
  would overwrite each other's options - confirm callers serialize access.
  """

  def __init__(self,scan_html=True):
    "scan_html selects whether check_html is run on each part."
    self.scan_html = scan_html

  def _chk_name(self,msg):
    "Check one part and, if enabled, the message contained in it."
    verdict = check_name(msg,self._savname,self._check,self.scan_zip)
    if self.scan_html:
      check_html(msg,self._savname)	# remove scripts from HTML
    if not self.scan_rfc822:
      return verdict
    embedded = msg.get_submsg()
    if isinstance(embedded,Message):
      return check_attachments(embedded,self._chk_name)
    return verdict

  def __call__(self,msg,savname=None,check=check_ext,scan_rfc822=True,
		scan_zip=False):
    """Compatible entry point.
    Replace all attachments with dangerous names.

    Returns True if msg reports itself modified afterwards, else False."""
    self._savname, self._check = savname, check
    self.scan_rfc822, self.scan_zip = scan_rfc822, scan_zip
    check_attachments(msg,self._chk_name)
    return True if msg.ismodified() else False

# emulate old defang function
defang = _defang()
404
# Python 2 imports sgmllib from the standard library; later versions use the
# copy in the Milter package.  Compare the numeric major version:
# sys.version is a display string and comparing it as text misorders
# versions such as '10.0'.
if sys.version_info[0] < 3:
    from sgmllib import SGMLParser as HTMLParser
else:
    from Milter.sgmllib import SGMLParser as HTMLParser
409
410import re
# A name inside a markup declaration: a letter followed by letters, digits,
# '-', '_' or '.', plus any trailing whitespace.
declname = re.compile(
  r'[a-zA-Z][-_.a-zA-Z0-9]*\s*'
)
# A single or double quoted string literal inside a markup declaration, plus
# any trailing whitespace.
declstringlit = re.compile(
  r'(\'[^\']*\'|"[^"]*")\s*'
)
413
class SGMLFilter(HTMLParser):
  """Parse HTML and pass through all constructs unchanged.  It is intended for
     derived classes to implement exceptional processing for selected cases.

     Everything is written to the writer passed to the constructor.
  """
  def __init__(self,out):
    "out is any object with a write() method; it receives the output."
    HTMLParser.__init__(self)
    self.out = out

  def handle_comment(self,comment):
    "Write the comment back with its delimiters."
    self.out.write("<!--%s-->" % comment)

  def unknown_starttag(self,tag,attr):
    "Write a start tag, using the original text when the parser has it."
    emit = self.out.write
    if hasattr(self,"get_starttag_text"):
      emit(self.get_starttag_text())
      return
    # no original text available: rebuild the tag from its parsed pieces
    emit("<%s" % tag)
    for key,val in attr:
      emit(' %s="%s"' % (key,val))
    emit('>')

  def handle_data(self,data):
    "Write character data unchanged."
    self.out.write(data)

  def handle_entityref(self,ref):
    "Write an entity reference back in its source form."
    self.out.write("&%s;" % ref)

  def handle_charref(self,ref):
    "Write a character reference back in its source form."
    self.out.write("&#%s;" % ref)

  def unknown_endtag(self,tag):
    "Write an end tag."
    self.out.write("</%s>" % tag)

  def handle_special(self,data):
    "Write a markup declaration back with its delimiters."
    self.out.write("<!%s>" % data)

  def write(self,buf):
    "Act like a writer.  Why doesn't HTMLParser do this by default?"
    self.feed(buf)

  # Python-2.1 sgmllib rejects illegal declarations.  Since various Microsoft
  # products accept and output them, we need to pass them through -
  # at least until we discover that MS will execute them.
  # sgmlop-1.1 will not use this method, but calls handle_special to
  # do what we want.
  def parse_declaration(self, i):
      """Scan the declaration starting at index i of self.rawdata.

      On reaching the closing '>' the text between '<!' and '>' is passed
      to handle_special() and the index just past the '>' is returned.
      Returns -1 when the buffer ends first or a quoted string or name
      cannot be matched.
      """
      text = self.rawdata
      end = len(text)
      start = i + 2
      pos = start
      while pos < end:
          ch = text[pos]
          if ch == ">":
              # end of declaration syntax
              self.handle_special(text[start:pos])
              return pos + 1
          if ch in "\"'":
              token = declstringlit
          elif ch in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
              token = declname
          else:
              # any other character is skipped one at a time
              pos += 1
              continue
          found = token.match(text, pos)
          if not found:
              # incomplete or an error?
              return -1
          pos = found.end()
      # end of buffer between tokens
      return -1
484
class HTMLScriptFilter(SGMLFilter):
  """Remove scripts from an HTML document.

  Everything between a script start tag and its end tag is dropped and
  replaced by the comment in self.msg.  self.modified is set as soon as a
  script start tag is seen.
  """
  def __init__(self,out):
    SGMLFilter.__init__(self,out)
    self.ignoring = 0           # number of script elements currently open
    self.modified = False
    self.msg = "<!-- WARNING: embedded script removed -->"

  def start_script(self,unused):
    "Begin dropping output."
    self.ignoring += 1
    self.modified = True

  def end_script(self):
    "Stop dropping output once the outermost script element is closed."
    # An end tag without a matching start must not push the counter below
    # zero: a negative count would silence all text after it.
    if self.ignoring > 0:
      self.ignoring -= 1
      if not self.ignoring:
        self.out.write(self.msg)

  # The handlers below all drop their output inside a script element.
  # Tags, references and declarations are covered as well as text, so that
  # fragments of the script body are not written out as live markup.
  def handle_data(self,data):
    if not self.ignoring: SGMLFilter.handle_data(self,data)

  def handle_comment(self,comment):
    if not self.ignoring: SGMLFilter.handle_comment(self,comment)

  def unknown_starttag(self,tag,attr):
    if not self.ignoring: SGMLFilter.unknown_starttag(self,tag,attr)

  def unknown_endtag(self,tag):
    if not self.ignoring: SGMLFilter.unknown_endtag(self,tag)

  def handle_entityref(self,ref):
    if not self.ignoring: SGMLFilter.handle_entityref(self,ref)

  def handle_charref(self,ref):
    if not self.ignoring: SGMLFilter.handle_charref(self,ref)

  def handle_special(self,data):
    if not self.ignoring: SGMLFilter.handle_special(self,data)
505
def check_html(msg,savname=None):
  """Remove scripts from HTML attachments.

  msg      the message part to examine
  savname  name the original message was saved under, quoted in the warning

  A part is treated as HTML when its type is text/html, or when it is
  application/octet-stream with a name ending in ".htm".  If the HTML
  cannot be decoded or parsed, the part is replaced by a plain text warning
  named WARNING.TXT.  If scripts were removed, the payload is replaced by
  the filtered HTML, quoted-printable encoded.  Always returns
  Milter.CONTINUE.
  """
  msgtype = msg.get_content_type().lower()
  # check for more MSIE braindamage
  if msgtype == 'application/octet-stream':
    for (attr,name) in msg.getnames():
      if name and name.lower().endswith(".htm"):
        msgtype = 'text/html'
  if msgtype != 'text/html':
    return Milter.CONTINUE
  out = StringIO()
  htmlfilter = HTMLScriptFilter(out)
  try:
    # NOTE(review): decode() uses the default codec, so HTML in another
    # charset that fails to decode ends up in the warning branch below.
    htmlfilter.write(msg.get_payload(decode=True).decode())
    htmlfilter.close()
  #except sgmllib.SGMLParseError:
  except Exception:
    # Keep a copy of the offending payload for debugging.  This is best
    # effort only: failing to save it must not abort the filter.
    try:
      raw = msg.get_payload(decode=True)
      if raw is not None:
        with open('debug.out','wb') as dbg:
          dbg.write(raw)
    except Exception:
      pass
    try:
      htmlfilter.close()
    except Exception:
      # the parser already failed once; a second failure adds nothing
      pass
    hostname = socket.gethostname()
    msg.set_payload(
  "An HTML attachment could not be parsed.  The original is saved as '%s:%s'"
    % (hostname,savname))
    del msg["content-type"]
    del msg["content-disposition"]
    del msg["content-transfer-encoding"]
    name = "WARNING.TXT"
    msg["Content-Type"] = "text/plain; name="+name
    return Milter.CONTINUE
  if htmlfilter.modified:
    # email.Encoders only exists on Python 2; email.encoders is the module
    # name available on both major versions.
    from email import encoders
    msg.set_payload(out)	# remove embedded scripts
    del msg["content-transfer-encoding"]
    encoders.encode_quopri(msg)
  return Milter.CONTINUE
539
# Command line use: print the MIME structure of each message file named on
# the command line, followed by one line per attachment.
if __name__ == '__main__':
  import email.iterators	# make sure the submodule is loaded

  def _list_attach(msg):
    "Print file name, content type and payload type of a part, recursively."
    p = msg.get_payload(decode=True)
    print(msg.get_filename(),msg.get_content_type(),type(p))
    msg = msg.get_submsg()
    if isinstance(msg,Message):
      return check_attachments(msg,_list_attach)
    return Milter.CONTINUE

  for fname in sys.argv[1:]:
    # the message is parsed completely before the file is closed
    with open(fname,'rb') as fp:
      msg = message_from_file(fp)
    email.iterators._structure(msg)
    check_attachments(msg,_list_attach)
555