1import base64
2import imghdr
3import logging
4import mimetypes
5import quopri
6from contextlib import closing
7from os import path
8
9import six
10from six.moves import StringIO
11
12from flanker import metrics, _email
13from flanker.mime import bounce
14from flanker.mime.message import headers, charsets
15from flanker.mime.message.errors import EncodingError, DecodingError
16from flanker.mime.message.headers import (WithParams, ContentType, MessageId,
17                                          Subject)
18from flanker.mime.message.headers.parametrized import fix_content_type
19from flanker.utils import is_pure_ascii
20
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Default Content-Transfer-Encoding header value, used as the fallback when a
# part's headers do not carry that header (see `Stream._load_body`).
CTE = WithParams('7bit', {})
24
25
class Stream(object):
    """
    Lazily parsed view over the region `start`..`end` (inclusive) of a
    seekable `stream`.

    Headers and the decoded body are read from the stream only when they are
    first requested and are cached afterwards. `string` is the original
    message text; only its length is used here (as `size`).
    """

    def __init__(self, content_type, start, end, string, stream):
        # location of the part inside the underlying stream
        self.stream = stream
        self.string = string
        self.start = start
        self.end = end
        self.content_type = content_type

        # lazily populated state
        self._headers = None
        self._body_start = None
        self._body = None
        self._body_changed = False

        self.size = len(self.string)

    @property
    def headers(self):
        """Parsed headers of the part (loaded on first access)."""
        self._load_headers()
        return self._headers

    @property
    def body(self):
        """Decoded body of the part (loaded on first access)."""
        self._load_body()
        return self._body

    @body.setter
    def body(self, value):
        self._set_body(value)

    def read_message(self):
        """Return the raw text of the whole part, headers included."""
        self.stream.seek(self.start)
        length = self.end - self.start + 1
        return self.stream.read(length)

    def read_body(self):
        """Return the raw (still transfer-encoded) body of the part."""
        self._load_headers()
        self.stream.seek(self._body_start)
        length = self.end - self._body_start + 1
        return self.stream.read(length)

    def _load_headers(self):
        """Parse the headers once and remember where the body begins."""
        if self._headers is not None:
            return
        self.stream.seek(self.start)
        self._headers = headers.MimeHeaders.from_stream(self.stream)
        # the header parser leaves the stream positioned at the body
        self._body_start = self.stream.tell()

    def _load_body(self):
        """Read and decode the body once."""
        if self._body is not None:
            return
        self._load_headers()
        self.stream.seek(self._body_start)
        transfer_encoding = self.headers.get(
            'Content-Transfer-Encoding', CTE).value
        raw = self.stream.read(self.end - self._body_start + 1)
        self._body = _decode_body(self.content_type, transfer_encoding, raw)

    def _set_body(self, value):
        """Replace the cached body, flagging the part as modified."""
        # NOTE: compares against the cache only; an unloaded body is `None`
        if value != self._body:
            self._body, self._body_changed = value, True

    def _stream_prepended_headers(self, out):
        """Write only the prepended headers (if any are loaded) to `out`."""
        if not self._headers:
            return
        self._headers.to_stream(out, prepends_only=True)

    def headers_changed(self, ignore_prepends=False):
        """Tell whether loaded headers report a change."""
        if self._headers is None:
            # never parsed, hence never touched
            return False
        return self._headers.have_changed(ignore_prepends)

    def body_changed(self):
        """Tell whether the body was replaced through the setter."""
        return self._body_changed
93
94
def adjust_content_type(content_type, body=None, filename=None):
    """
    Refine `content_type` using the file name and/or the body contents.

    A generic 'application/octet-stream' type is first replaced by a guess
    made from `filename`. After that, for image and audio types with a
    non-empty `body`, the subtype is re-detected from the body itself.
    Returns the (possibly new) content type object.
    """
    if filename and str(content_type) == 'application/octet-stream':
        # hard-coded overrides win and are returned straight away
        override = _guess_type(filename)
        if override:
            return override

        # nothing hard-coded, ask the standard extension map
        by_extension = mimetypes.guess_type(filename)[0]
        if by_extension:
            main, sub = fix_content_type(
                by_extension, default=('application', 'octet-stream'))
            content_type = ContentType(main, sub)

    main_type = content_type.main

    if main_type == 'image' and body:
        # only the first bytes are needed to recognise the image format
        head = body[:32]
        if six.PY3 and isinstance(body, six.text_type):
            head = head.encode('utf-8', 'ignore')

        detected = imghdr.what(None, head)
        if detected:
            content_type = ContentType('image', detected)

    elif main_type == 'audio' and body:
        detected = _email.detect_audio_type(body)
        if detected:
            content_type = ContentType('audio', detected)

    return content_type
126
127
def _guess_type(filename):
    """
    Internal content type guesser for the few extensions the heuristic
    content type checker gets wrong.

    Returns a `ContentType` for a known extension and `None` otherwise.
    """
    # (file name suffix, 'application' subtype), checked in this order
    overrides = (
        ('.bz2', 'x-bzip2'),
        ('.gz', 'x-gzip'),
    )
    for suffix, subtype in overrides:
        if filename.endswith(suffix):
            return ContentType('application', subtype)

    return None
141
142
class Body(object):
    """
    Container for a newly created single part: its headers plus the body.

    Always reports itself as changed, so the part is serialized from the
    headers and body held here.
    """

    def __init__(self, content_type, body, charset=None, disposition=None,
                 filename=None, trust_ctype=False):
        """
        :param content_type: content type object of the part; replaced by the
            result of `adjust_content_type` unless `trust_ctype` is set.
        :param body: body of the part (text or bytes).
        :param charset: charset for text parts, 'utf-8' when omitted.
        :param disposition: Content-Disposition value; defaults to
            'attachment' when a `filename` is given.
        :param filename: attachment file name; only its base name is kept.
        :param trust_ctype: when true, `content_type` is used as is.
        """
        self.headers = headers.MimeHeaders()
        self.body = body
        self.disposition = disposition or ('attachment' if filename else None)
        self.filename = filename
        self.size = len(body)

        if self.filename:
            # never leak directory components into the headers
            self.filename = path.basename(self.filename)

        if not trust_ctype:
            content_type = adjust_content_type(content_type, body, filename)

        if content_type.main == 'text':
            # the text should have a charset
            if not charset:
                charset = 'utf-8'

            # it should be stored as unicode. period
            if isinstance(body, six.binary_type):
                self.body = charsets.convert_to_unicode(charset, body)

            # let's be simple when possible
            if charset != 'ascii' and is_pure_ascii(body):
                charset = 'ascii'

        self.headers['MIME-Version'] = '1.0'
        self.headers['Content-Type'] = content_type
        if charset:
            content_type.params['charset'] = charset

        if self.disposition:
            # Use the *effective* disposition: when only `filename` was
            # passed, `disposition` is None while `self.disposition` holds
            # the implied 'attachment' value.
            self.headers['Content-Disposition'] = WithParams(self.disposition)
            if self.filename:
                self.headers['Content-Disposition'].params['filename'] = self.filename
                self.headers['Content-Type'].params['name'] = self.filename

    @property
    def content_type(self):
        """Content type stored in the `Content-Type` header."""
        return self.headers['Content-Type']

    def headers_changed(self, ignore_prepends=False):
        """Always true: the headers of a new body are never 'original'."""
        return True

    def body_changed(self):
        """Always true: a new body always has to be encoded."""
        return True

    def _stream_prepended_headers(self, out):
        """Write only the prepended headers to `out`."""
        self.headers.to_stream(out, prepends_only=True)
194
195
class Part(object):
    """
    Container for a newly created part without a body of its own; it only
    carries headers and always reports itself as changed.
    """

    def __init__(self, ctype):
        self.size = 0
        self.body = None
        self.headers = headers.MimeHeaders()
        # header insertion order is kept as is: Content-Type, MIME-Version
        self.headers['Content-Type'] = ctype
        self.headers['MIME-Version'] = '1.0'

    @property
    def content_type(self):
        """Content type stored in the `Content-Type` header."""
        return self.headers['Content-Type']

    def body_changed(self):
        """Always true for a freshly created part."""
        return True

    def headers_changed(self, ignore_prepends=False):
        """Always true for a freshly created part."""
        return True

    def _stream_prepended_headers(self, out):
        """Write only the prepended headers to `out`."""
        self.headers.to_stream(out, prepends_only=True)
217
218
class RichPartMixin(object):
    """
    Convenience accessors shared by message parts.

    The host class is expected to provide `headers`, `content_type` and
    `content_disposition`, plus `parts` / `enclosed` for container parts.
    """

    def __init__(self, is_root=False):
        self._bounce = None
        self._is_root = is_root

    @property
    def message_id(self):
        """Message id parsed from the `Message-Id` header."""
        raw_id = self.headers.get('Message-Id', '')
        return MessageId.from_string(raw_id)

    @message_id.setter
    def message_id(self, value):
        is_valid = MessageId.is_valid(value)
        if not is_valid:
            raise ValueError('invalid message id format')
        self.headers['Message-Id'] = '<{0}>'.format(value)

    @property
    def subject(self):
        """Value of the `Subject` header, '' when it is absent."""
        return self.headers.get('Subject', '')

    @property
    def clean_subject(self):
        """
        Subject without re, fw, fwd, HA prefixes
        """
        wrapped = Subject(self.subject)
        return wrapped.strip_replies()

    @property
    def references(self):
        """
        Returns a list of message ids referencing the message in accordance to
        the Jamie Zawinski threading algorithm.

        See http://www.jwz.org/doc/threading.html for details.
        """
        found = list(MessageId.scan(self.headers.get('References', '')))
        if found:
            return found

        # no References header, fall back to In-Reply-To
        in_reply_to = MessageId.from_string(
            self.headers.get('In-Reply-To', ''))
        if in_reply_to:
            # NOTE(review): `message_id` uses the `from_string` result as a
            # whole id, while this takes its first element - confirm which
            # one matches what `MessageId.from_string` returns.
            found.append(in_reply_to[0])
        return found

    @property
    def detected_file_name(self):
        """
        Detects file name based on content type or part name.
        """
        type_params = self.content_type.params
        name = type_params.get('name', '') or type_params.get('filename', '')

        # the disposition file name, when present, takes precedence
        disposition, disposition_params = self.content_disposition
        if disposition in ['attachment', 'inline']:
            name = disposition_params.get('filename', '') or name

        # filenames can be presented as tuples, like:
        # ('us-ascii', 'en-us', 'image.jpg')
        if isinstance(name, tuple) and len(name) == 3:
            encoding, encoded_name = name[0], name[2]
            # encoding permissible to be empty
            name = encoded_name.decode(encoding) if encoding else encoded_name

        return headers.mime_to_unicode(name)

    @property
    def detected_format(self):
        """`format_type` of the detected content type."""
        return self.detected_content_type.format_type

    @property
    def detected_subtype(self):
        """`subtype` of the detected content type."""
        return self.detected_content_type.subtype

    @property
    def detected_content_type(self):
        """
        Returns the content type adjusted by the detected file name and the
        original content type provided inside the message (the body is not
        passed to the detection here).
        """
        detected_name = self.detected_file_name
        return adjust_content_type(self.content_type, filename=detected_name)

    def is_body(self):
        """Tell whether the part is a nameless text or message part."""
        if self.detected_file_name:
            return False
        return self.content_type.format_type in ('text', 'message')

    def is_root(self):
        """Tell whether the part is flagged as the root of a message."""
        return self._is_root

    def set_root(self, val):
        """Set the root flag, coercing `val` to a boolean."""
        self._is_root = bool(val)

    def walk(self, with_self=False, skip_enclosed=False):
        """
        Returns iterator object traversing through the message parts. If the
        top level part needs to be included then set the `with_self` to `True`.
        If the parts of the enclosed messages should not be included then set
        the `skip_enclosed` parameter to `True`.
        """
        if with_self:
            yield self

        ctype = self.content_type

        if ctype.is_multipart():
            # depth first: every child followed by its own descendants
            for child in self.parts:
                yield child
                descendants = child.walk(
                    with_self=False, skip_enclosed=skip_enclosed)
                for descendant in descendants:
                    yield descendant

        elif ctype.is_message_container() and not skip_enclosed:
            inner = self.enclosed
            yield inner
            for descendant in inner.walk(with_self=False):
                yield descendant

    def is_attachment(self):
        """Tell whether the disposition value is 'attachment'."""
        return self.content_disposition[0] == 'attachment'

    def is_inline(self):
        """Tell whether the disposition value is 'inline'."""
        return self.content_disposition[0] == 'inline'

    def is_delivery_notification(self):
        """
        Tells whether a message is a system delivery notification.
        """
        ctype = self.content_type
        return (ctype == 'multipart/report'
                and ctype.params.get('report-type') == 'delivery-status')

    def get_attached_message(self):
        """
        Returns attached message if found, `None` otherwise.
        """
        try:
            for candidate in self.walk(with_self=True):
                if candidate.content_type == 'message/rfc822':
                    # the first thing walked inside the container, if any
                    for attached in candidate.walk():
                        return attached
        except Exception:
            log.exception('Failed to get attached message')
            return None

    def remove_headers(self, *header_names):
        """
        Removes all passed headers name in one operation.
        """
        present = self.headers
        for name in header_names:
            if name in present:
                del present[name]

    @property
    def bounce(self):
        """
        Deprecated: use bounce.detect(message).
        """
        # computed on first use and cached while truthy
        if not self._bounce:
            self._bounce = bounce.detect(self)
        return self._bounce

    def is_bounce(self, probability=0.3):
        """
        Deprecated: use bounce.detect(message).
        """
        detection = self.bounce
        return detection.is_bounce(probability)

    def __str__(self):
        return '({0})'.format(self.content_type)
388
389
class MimePart(RichPartMixin):
    """
    A message part backed by a container object (e.g. a `Stream`) that
    supplies its headers, content type and body.

    `parts` holds the children of a multipart part and `enclosed` holds the
    message wrapped by a message-container part.
    """

    def __init__(self, container, parts=None, enclosed=None, is_root=False):
        RichPartMixin.__init__(self, is_root)
        self._container = container
        self.parts = parts or []
        self.enclosed = enclosed

    @property
    def size(self):
        """ Returns message size in bytes"""
        # NOTE(review): container sizes are `len()` of the stored string or
        # body, which equals a byte count only for byte strings - confirm.
        if self.is_root() and not self.was_changed():
            # untouched root: the size is known without serializing
            if isinstance(self._container, Stream):
                return self._container.size
            else:
                return sum(part._container.size
                           for part in self.walk(with_self=True))
        else:
            # otherwise serialize into a sink that only counts what is written
            with closing(_CounterIO()) as out:
                self.to_stream(out)
                return out.getvalue()

    @property
    def headers(self):
        """Returns multi dictionary with headers converted to unicode,
        headers like Content-Type, Content-Disposition are tuples
        ('value', {'param': 'val'})"""
        return self._container.headers

    @property
    def content_type(self):
        """ returns object with properties:
        main - main part of content type
        sub - subpart of content type
        params - dictionary with parameters
        """
        return self._container.content_type

    @property
    def content_disposition(self):
        """ returns tuple (value, params) """
        # a missing header yields a disposition whose value is None
        return self.headers.get('Content-Disposition', WithParams(None))

    @property
    def content_encoding(self):
        """Content-Transfer-Encoding header, '7bit' when it is absent."""
        return self.headers.get(
            'Content-Transfer-Encoding', WithParams('7bit'))

    @content_encoding.setter
    def content_encoding(self, value):
        self.headers['Content-Transfer-Encoding'] = value

    @property
    def body(self):
        """ returns decoded body """
        # parts of any other kind fall through and yield None
        if self.content_type.is_singlepart()\
                or self.content_type.is_delivery_status():
            return self._container.body

    @body.setter
    def body(self, value):
        # silently ignored for parts that are neither single part nor
        # delivery status
        if self.content_type.is_singlepart()\
                or self.content_type.is_delivery_status():
            self._container.body = value

    @property
    def charset(self):
        """Charset reported by the content type."""
        return self.content_type.get_charset()

    @charset.setter
    def charset(self, value):
        # the header parameter gets the lower-cased name, while the content
        # type object receives `value` as passed in
        charset = value.lower()
        self.content_type.set_charset(value)
        if 'Content-Type' not in self.headers:
            self.headers['Content-Type'] = ContentType('text', 'plain', {})
        self.headers['Content-Type'].params['charset'] = charset
        # mutating params in place is flagged explicitly as a header change
        self.headers.changed = True

    def to_string(self):
        """
        Returns a MIME representation of the message.
        """
        # this optimisation matters *A LOT*
        # when there are no prepended headers
        # we submit the original string,
        # no copying, no alternation, yeah!
        if self.is_root() and not self.was_changed(ignore_prepends=True):
            with closing(StringIO()) as out:
                self._container._stream_prepended_headers(out)
                return out.getvalue() + self._container.string
        else:
            with closing(StringIO()) as out:
                self.to_stream(out)
                return out.getvalue()

    def to_stream(self, out):
        """
        Serializes the message using a file like object.
        """
        if not self.was_changed(ignore_prepends=True):
            # nothing but (possibly) prepended headers differs from the
            # original, so the original text is copied through
            self._container._stream_prepended_headers(out)
            out.write(self._container.read_message())
        else:
            try:
                original_position = out.tell()
                self._to_stream_when_changed(out)
            except DecodingError:
                # re-encoding failed: rewind `out` to where this part started
                # and emit the original text of the part instead
                out.seek(original_position)
                out.write(self._container.read_message())

    def was_changed(self, ignore_prepends=False):
        """
        Tells whether the headers or the content of this part, or of any part
        below it, were modified. `ignore_prepends` is applied to this part's
        own headers only; nested parts are asked with their default.
        """
        if self._container.headers_changed(ignore_prepends):
            return True

        if self.content_type.is_singlepart():
            if self._container.body_changed():
                return True
            return False

        elif self.content_type.is_multipart():
            return any(p.was_changed() for p in self.parts)

        elif self.content_type.is_message_container():
            return self.enclosed.was_changed()
        # NOTE(review): a content type matching none of the branches above
        # falls through and returns None (falsy) - confirm that is intended.

    def to_python_message(self):
        """Re-parse the serialized message via `_email.message_from_string`."""
        return _email.message_from_string(self.to_string())

    def append(self, *messages):
        """Add `messages` as child parts; each one stops being a root."""
        for m in messages:
            self.parts.append(m)
            m.set_root(False)

    def enclose(self, message):
        """Make `message` the enclosed message; it stops being a root."""
        self.enclosed = message
        message.set_root(False)

    def _to_stream_when_changed(self, out):
        """
        Serializes a modified part to `out`, re-encoding what has changed.
        """

        ctype = self.content_type

        if ctype.is_singlepart():

            if self._container.body_changed():
                # encode first: the charset and transfer encoding picked here
                # are stored in the headers before those are written below
                charset, encoding, body = _encode_body(self)
                if charset:
                    self.charset = charset
                self.content_encoding = WithParams(encoding)
            else:
                body = self._container.read_body()

            # RFC allows subparts without headers
            if self.headers:
                self.headers.to_stream(out)
            elif self.is_root():
                raise EncodingError('Root message should have headers')

            out.write(_CRLF)
            out.write(body)
        else:
            self.headers.to_stream(out)
            out.write(_CRLF)

            if ctype.is_multipart():
                boundary = ctype.get_boundary_line()
                for index, part in enumerate(self.parts):
                    # every boundary but the first is preceded by a CRLF
                    out.write(
                        (_CRLF if index != 0 else '') + boundary + _CRLF)
                    part.to_stream(out)
                out.write(_CRLF + ctype.get_boundary_line(final=True) + _CRLF)

            elif ctype.is_message_container():
                self.enclosed.to_stream(out)
563
564
def _decode_body(content_type, content_encoding, body):
    """
    Undo the transfer encoding of `body` and then its charset encoding.
    """
    # the transfer encoding wraps the charset-encoded payload
    payload = _decode_transfer_encoding(content_encoding, body)
    return _decode_charset(content_type, payload)
571
572
def _decode_transfer_encoding(encoding, body):
    """
    Decode `body` according to the Content-Transfer-Encoding `encoding`.

    Only 'base64' and 'quoted-printable' are decoded; with any other value
    the body is returned untouched.
    """
    if encoding == 'quoted-printable':
        return quopri.decodestring(body)

    if encoding == 'base64':
        return _base64_decode(body)

    return body
580
581
def _decode_charset(ctype, body):
    """
    Convert the body of a text part to unicode using the charset of `ctype`.

    Bodies of non-text parts are returned as they are.
    """
    if ctype.main != 'text':
        return body

    charset = ctype.get_charset()
    text = charsets.convert_to_unicode(charset, body)

    # for text/html unicode bodies make sure to replace
    # the whitespace (0xA0) with &nbsp; Outlook is reported to
    # have a bug there
    is_utf8_html = ctype.sub == 'html' and charset == 'utf-8'
    if is_utf8_html:
        text = text.replace(u'\xa0', u'&nbsp;')

    return text
597
598
def _encode_body(part):
    """
    Encode the body of `part` for serialization.

    Returns a `(charset, transfer_encoding, encoded_body)` tuple. Text that is
    not an attachment gets a transfer encoding chosen from its content;
    everything else is sent as base64.
    """
    content_type = part.content_type
    preferred_encoding = part.content_encoding.value
    payload = part._container.body
    charset = content_type.get_charset()

    # base64 unless the part turns out to be inline text
    transfer_encoding = 'base64'

    if content_type.main == 'text':
        charset, payload = _encode_charset(charset, payload)
        if not part.is_attachment():
            transfer_encoding = _choose_text_encoding(
                charset, preferred_encoding, payload)
            # report which text encoding is chosen
            metrics.incr('encoding.' + transfer_encoding)

    encoded = _encode_transfer_encoding(transfer_encoding, payload)
    return charset, transfer_encoding, encoded
619
620
def _encode_charset(preferred_charset, text):
    """
    Encode `text` with the preferred charset, falling back to utf-8.

    :param preferred_charset: charset name to try first; 'ascii' is tried
        when it is empty or `None`.
    :param text: unicode text to encode.
    :return: `(charset, encoded_text)` where `charset` names the charset that
        was actually used.
    """
    # Encode with the *effective* charset. Encoding with a missing
    # `preferred_charset` used to fail unconditionally, which labelled even
    # plain ASCII text as utf-8.
    charset = preferred_charset or 'ascii'
    try:
        return charset, text.encode(charset)
    except Exception:
        # Best effort: unknown charset names and characters the charset
        # cannot represent both end up as utf-8. Unlike a bare `except`,
        # this lets KeyboardInterrupt / SystemExit through.
        return 'utf-8', text.encode('utf-8')
629
630
def _encode_transfer_encoding(encoding, body):
    """
    Apply the Content-Transfer-Encoding `encoding` to `body`.

    On Python 3 the result is always text (encoded bytes are decoded as
    utf-8); on Python 2 the encoded value is returned as produced. Values
    other than 'quoted-printable' and 'base64' leave the body unencoded.
    """
    if not six.PY3:
        if encoding == 'quoted-printable':
            return fix_leading_dot(
                quopri.encodestring(body, quotetabs=False))
        if encoding == 'base64':
            return _email.encode_base64(body)
        return body

    # Python 3 from here on
    if encoding == 'quoted-printable':
        quoted = fix_leading_dot(quopri.encodestring(body, quotetabs=False))
        return quoted.decode('utf-8')

    if encoding == 'base64':
        # base64 works on bytes, so text is encoded first
        if isinstance(body, six.text_type):
            body = body.encode('utf-8')
        return _email.encode_base64(body).decode('utf-8')

    if isinstance(body, six.binary_type):
        return body.decode('utf-8')

    return body
657
658
def fix_leading_dot(s):
    """
    Re-quote every line of the quoted-printable bytes `s` that starts with
    a '.' so that it starts with '=2E' instead.

    From SMTP RFC: https://tools.ietf.org/html/rfc5321#section-4.5.2

    -----
    When a line of mail text is received by the SMTP server, it checks
    the line.  If the line is composed of a single period, it is
    treated as the end of mail indicator.  If the first character is a
    period and there are other characters on the line, the first
    character is deleted.
    -----

    Some remote SMTP servers have been observed to intermittently drop such
    a leading '.' even though it sits inside a mime part and the sending
    client dot-stuffed the line. Quoting the dot avoids the problem.
    """
    # indexing bytes gives an int on Python 3 and a 1-char string on Python 2
    dot = ord('.') if six.PY3 else b"."

    source = six.BytesIO(s)
    result = six.BytesIO()

    # TODO(thrawn01): We could scan the entire string looking for leading '.'
    #  If none found return the original string. This would save memory at the
    #  expense of some additional processing

    # readline() hands back an empty value once the input is exhausted
    for line in iter(source.readline, b''):
        if line[0] == dot:
            line = _quote_and_cut(line)
        result.write(line)

    return result.getvalue()
699
700
def _quote_and_cut(ln):
    """
    Quotes the leading '.', if the resulting line is longer than 76 characters
    cut the line in half without dividing any quoted characters and
    conforming to the quoted-printable RFC in regards to ending characters.

    `ln` is a single line of bytes. The first byte is quoted unconditionally,
    so callers are expected to pass only lines that start with a '.'.
    Returns the re-quoted line, split in two with a soft line break when
    it became too long.
    """
    ln = quopri.quote(ln[0:1]) + ln[1:]

    # If the line is under the 76 + '\n' character limit
    if len(ln) <= 77:
        return ln

    # Find a suitable cut point that doesn't divide a quoted character
    # in_quote counts the position inside an '=XX' triple (0 means outside)
    in_quote, pos = 0, -1
    for pos, c in enumerate(ln):

        # Skip quoted (=XX) characters
        if in_quote != 0:
            in_quote += 1
            # values 2 and 3 are the two hex digits that follow the '='
            if in_quote <= 3:
                continue
            in_quote = 0

        # If we are past the half way mark, make our cut here
        if pos > len(ln)/2:
            break

        # iterating over bytes yields ints on Python 3, make it bytes again
        if six.PY3:
            c = bytes((c,))

        # Should be a quoted character
        if c == b'=':
            # Peek ahead, does the next char appear to be a hex value?
            if quopri.ishex(ln[pos+1:pos+2]):
                in_quote = 1
            continue

    new_line = ln[:pos]
    next_line = ln[pos:]

    # If new line ends with a :space or :tab
    # (quote it, the line is about to end with a soft line break)
    if new_line[-1:] in b' \t':
        new_line = new_line[:-1] + quopri.quote(new_line[-1:])

    # indexing bytes gives an int on Python 3, so compare with the code point
    dot = b'.'
    if six.PY3:
        dot = ord('.')

    # If the next line starts with a '.'
    if next_line[0] == dot:
        next_line = quopri.quote(next_line[0:1]) + next_line[1:]

    # '=' followed by a newline is the soft line break joining both halves
    return new_line + b"=\n" + next_line
754
755
def _choose_text_encoding(charset, preferred_encoding, body):
    """
    Pick the transfer encoding for a text body.

    The preferred encoding is kept as is only for ascii / iso-8859-1 text
    without long lines; in every other case it is raised to at least
    'quoted-printable'.
    """
    simple_charset = charset in ('ascii', 'iso-8859-1', 'us-ascii')
    if simple_charset and not has_long_lines(body):
        return preferred_encoding

    return _stronger_encoding(preferred_encoding, 'quoted-printable')
765
766
def _stronger_encoding(a, b):
    """
    Return whichever of the two transfer encodings ranks higher, preferring
    `a` on a tie. An unknown `a` ranks below everything; `b` must be known.
    """
    rank = {'7bit': 0, 'quoted-printable': 1, 'base64': 1, '8bit': 3}
    rank_a = rank.get(a, -1)
    rank_b = rank[b]
    return a if rank_a >= rank_b else b
772
773
def has_long_lines(text, max_line_len=599):
    """
    Returns True if text contains a line of at least `max_line_len`
    characters. Some SMTP servers (Exchange) refuse to accept messages
    'wider' than certain length.
    """
    if not text:
        # nothing to measure (also covers None)
        return False

    return any(len(line) >= max_line_len for line in text.splitlines())
786
787
def _base64_decode(s):
    """
    Decode base64, trying to repair the input when it is broken.

    If the plain decode fails, characters outside the base64 alphabet are
    stripped and the padding is rebuilt before decoding once more.
    """
    try:
        return base64.b64decode(s)

    except (TypeError, ValueError):
        cleaned = _recover_base64(s)
        remainder = len(cleaned) % 4
        if remainder == 1:
            # crop last character as adding padding does not help
            repaired = cleaned[:-1]
        else:
            # add padding
            repaired = cleaned + '=' * (4 - remainder)
        return base64.b64decode(repaired)
802
803
class _CounterIO(object):
    """
    Minimal write-only stand-in for a file object that stores nothing and
    only keeps track of how much has been written to it.
    """

    def __init__(self):
        self.length = 0

    def write(self, s):
        """Account for `s` without keeping it."""
        self.length = self.length + len(s)

    def tell(self):
        """Current position, i.e. the amount counted so far."""
        return self.length

    def seek(self, p):
        """Move to `p`, discarding whatever was counted beyond it."""
        self.length = p

    def getvalue(self):
        """Return the counted length (not the written data)."""
        return self.length

    def close(self):
        """Nothing to release."""
        return None
823
824
# Line terminator written after headers and around multipart boundaries.
_CRLF = '\r\n'


# To recover base64 we need to translate the part to the base64 alphabet.
_b64_alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
# All of chr(0)..chr(255) that are not in the alphabet above, collected into
# one string; `_recover_base64` passes it to `translate` as the set of
# characters to delete on Python 2.
_b64_invalid_chars = ''
for ch in range(256):
    if chr(ch) not in _b64_alphabet:
        _b64_invalid_chars += chr(ch)
834
835
def _recover_base64(s):
    """
    Strip every character that is not part of the base64 alphabet
    (A-Z, a-z, 0-9, '+' and '/') from `s`; padding characters go as well.
    """
    if six.PY2:
        return s.translate(None, _b64_invalid_chars)

    def _in_alphabet(c):
        return (('A' <= c <= 'Z') or
                ('a' <= c <= 'z') or
                ('0' <= c <= '9') or
                c == '+' or c == '/')

    return ''.join(c for c in s if _in_alphabet(c))
855