"""Containers and helpers for parsed and newly created MIME message parts."""
import base64
import imghdr
import logging
import mimetypes
import quopri
from contextlib import closing
from os import path

import six
from six.moves import StringIO

from flanker import metrics, _email
from flanker.mime import bounce
from flanker.mime.message import headers, charsets
from flanker.mime.message.errors import EncodingError, DecodingError
from flanker.mime.message.headers import (WithParams, ContentType, MessageId,
                                          Subject)
from flanker.mime.message.headers.parametrized import fix_content_type
from flanker.utils import is_pure_ascii

log = logging.getLogger(__name__)

# Default Content-Transfer-Encoding used when a part does not declare one.
CTE = WithParams('7bit', {})


class Stream(object):
    """
    Container for a part that lives inside an already serialized message.

    Headers and body are loaded lazily from `stream` the first time they are
    requested. `start` and `end` are offsets into `stream`; `end` is
    inclusive (reads use `end - start + 1`).
    """

    def __init__(self, content_type, start, end, string, stream):
        self.content_type = content_type
        self.start = start
        self.end = end
        self.string = string
        self.stream = stream

        self._headers = None
        self._body_start = None
        self._body = None
        self._body_changed = False
        self.size = len(self.string)

    @property
    def headers(self):
        """Headers of the part, parsed from the stream on first access."""
        self._load_headers()
        return self._headers

    @property
    def body(self):
        """Decoded body of the part, read from the stream on first access."""
        self._load_body()
        return self._body

    @body.setter
    def body(self, value):
        self._set_body(value)

    def read_message(self):
        """Returns the raw part (headers and body) as stored in the stream."""
        self.stream.seek(self.start)
        return self.stream.read(self.end - self.start + 1)

    def read_body(self):
        """Returns the raw, still transfer-encoded body from the stream."""
        self._load_headers()
        self.stream.seek(self._body_start)
        return self.stream.read(self.end - self._body_start + 1)

    def _load_headers(self):
        if self._headers is None:
            self.stream.seek(self.start)
            self._headers = headers.MimeHeaders.from_stream(self.stream)
            # the body begins wherever header parsing left the stream
            self._body_start = self.stream.tell()

    def _load_body(self):
        if self._body is None:
            self._load_headers()
            self.stream.seek(self._body_start)
            self._body = _decode_body(
                self.content_type,
                self.headers.get('Content-Transfer-Encoding', CTE).value,
                self.stream.read(self.end - self._body_start + 1))

    def _set_body(self, value):
        # only flag a change when the new value really differs
        if value != self._body:
            self._body = value
            self._body_changed = True

    def _stream_prepended_headers(self, out):
        if self._headers:
            self._headers.to_stream(out, prepends_only=True)

    def headers_changed(self, ignore_prepends=False):
        """Tells whether headers were loaded and modified afterwards."""
        return (self._headers is not None
                and self._headers.have_changed(ignore_prepends))

    def body_changed(self):
        """Tells whether the body was replaced through the `body` setter."""
        return self._body_changed


def adjust_content_type(content_type, body=None, filename=None):
    """
    Adjust content type based on filename or body contents.

    A generic `application/octet-stream` type is refined from the file name
    (internal guesser first, then `mimetypes`). Image and audio types get
    their subtype re-detected from the body when a body is supplied.
    Returns either the passed `content_type` or a new `ContentType`.
    """
    if filename and str(content_type) == 'application/octet-stream':
        # check if our internal guess returns anything
        guessed = _guess_type(filename)
        if guessed:
            return guessed

        # our internal attempt didn't return anything, use mimetypes
        guessed = mimetypes.guess_type(filename)[0]
        if guessed:
            main, sub = fix_content_type(
                guessed, default=('application', 'octet-stream'))
            content_type = ContentType(main, sub)

    if content_type.main == 'image' and body:
        image_preamble = body[:32]
        # imghdr needs bytes; a text body is only possible on Python 3 here
        if six.PY3 and isinstance(body, six.text_type):
            image_preamble = image_preamble.encode('utf-8', 'ignore')

        sub = imghdr.what(None, image_preamble)
        if sub:
            content_type = ContentType('image', sub)

    elif content_type.main == 'audio' and body:
        sub = _email.detect_audio_type(body)
        if sub:
            content_type = ContentType('audio', sub)

    return content_type


def _guess_type(filename):
    """
    Internal content type guesser. This is used to hard code certain tricky
    content types that the heuristic content type checker gets wrong.

    Returns a `ContentType` for known extensions, `None` otherwise.
    """
    if filename.endswith('.bz2'):
        return ContentType('application', 'x-bzip2')

    if filename.endswith('.gz'):
        return ContentType('application', 'x-gzip')

    return None


class Body(object):
    """
    Container for a newly created single part with the given body.

    Unless `trust_ctype` is set, the content type is adjusted based on the
    body and the file name. Text bodies are stored as unicode and get a
    charset ('utf-8' by default, 'ascii' when the body is pure ASCII).
    """

    def __init__(self, content_type, body, charset=None, disposition=None,
                 filename=None, trust_ctype=False):
        self.headers = headers.MimeHeaders()
        self.body = body
        # a part that carries a file name is an attachment by default
        self.disposition = disposition or ('attachment' if filename else None)
        self.filename = filename
        self.size = len(body)

        if self.filename:
            self.filename = path.basename(self.filename)

        if not trust_ctype:
            content_type = adjust_content_type(content_type, body, filename)

        if content_type.main == 'text':
            # the text should have a charset
            if not charset:
                charset = 'utf-8'

            # it should be stored as unicode. period
            if isinstance(body, six.binary_type):
                self.body = charsets.convert_to_unicode(charset, body)

            # let's be simple when possible
            if charset != 'ascii' and is_pure_ascii(body):
                charset = 'ascii'

        self.headers['MIME-Version'] = '1.0'
        self.headers['Content-Type'] = content_type
        if charset:
            content_type.params['charset'] = charset

        if self.disposition:
            # use the effective disposition: the raw argument may be None
            # when the disposition was derived from the file name
            self.headers['Content-Disposition'] = WithParams(self.disposition)
            if self.filename:
                self.headers['Content-Disposition'].params['filename'] = \
                    self.filename
                self.headers['Content-Type'].params['name'] = self.filename

    @property
    def content_type(self):
        return self.headers['Content-Type']

    def headers_changed(self, ignore_prepends=False):
        # a freshly built part has no original form, so it always counts
        # as changed
        return True

    def body_changed(self):
        return True

    def _stream_prepended_headers(self, out):
        self.headers.to_stream(out, prepends_only=True)


class Part(object):
    """Container for a newly created part that has headers but no body."""

    def __init__(self, ctype):
        self.headers = headers.MimeHeaders()
        self.body = None
        self.headers['Content-Type'] = ctype
        self.headers['MIME-Version'] = '1.0'
        self.size = 0

    @property
    def content_type(self):
        return self.headers['Content-Type']

    def headers_changed(self, ignore_prepends=False):
        # a freshly built part has no original form, so it always counts
        # as changed
        return True

    def body_changed(self):
        return True

    def _stream_prepended_headers(self, out):
        self.headers.to_stream(out, prepends_only=True)


class RichPartMixin(object):
    """
    Convenience accessors shared by message parts.

    Relies on the host class to provide `headers`, `content_type`,
    `content_disposition`, `parts` and `enclosed`.
    """

    def __init__(self, is_root=False):
        self._is_root = is_root
        self._bounce = None

    @property
    def message_id(self):
        return MessageId.from_string(self.headers.get('Message-Id', ''))

    @message_id.setter
    def message_id(self, value):
        if not MessageId.is_valid(value):
            raise ValueError('invalid message id format')
        self.headers['Message-Id'] = '<{0}>'.format(value)

    @property
    def subject(self):
        return self.headers.get('Subject', '')

    @property
    def clean_subject(self):
        """
        Subject without re, fw, fwd, HA prefixes
        """
        return Subject(self.subject).strip_replies()

    @property
    def references(self):
        """
        Returns a list of message ids referencing the message in accordance to
        the Jamie Zawinski threading algorithm.

        See http://www.jwz.org/doc/threading.html for details.
        """
        refs = list(MessageId.scan(self.headers.get('References', '')))
        if not refs:
            # fall back to In-Reply-To when References yields nothing
            reply = MessageId.from_string(self.headers.get('In-Reply-To', ''))
            if reply:
                refs.append(reply[0])
        return refs

    @property
    def detected_file_name(self):
        """
        Detects file name based on content type or part name.

        The `filename` parameter of an attachment/inline Content-Disposition
        takes precedence over the `name`/`filename` Content-Type parameters.
        """
        ctype = self.content_type
        file_name = (ctype.params.get('name', '')
                     or ctype.params.get('filename', ''))

        value, params = self.content_disposition
        if value in ['attachment', 'inline']:
            file_name = params.get('filename', '') or file_name

        # filenames can be presented as tuples, like:
        # ('us-ascii', 'en-us', 'image.jpg')
        if isinstance(file_name, tuple) and len(file_name) == 3:
            # encoding permissible to be empty
            encoding, raw_name = file_name[0], file_name[2]
            # only byte strings can be decoded; text values (e.g. Python 3
            # `str`) have no `decode` and are used as they are
            if encoding and isinstance(raw_name, six.binary_type):
                file_name = raw_name.decode(encoding)
            else:
                file_name = raw_name

        file_name = headers.mime_to_unicode(file_name)
        return file_name

    @property
    def detected_format(self):
        return self.detected_content_type.format_type

    @property
    def detected_subtype(self):
        return self.detected_content_type.subtype

    @property
    def detected_content_type(self):
        """
        Returns content type based on the detected file name and the original
        content type provided inside the message. The body is not consulted.
        """
        return adjust_content_type(self.content_type,
                                   filename=self.detected_file_name)

    def is_body(self):
        """Tells whether the part is a text or message part with no file name."""
        return (not self.detected_file_name and
                (self.content_type.format_type == 'text' or
                 self.content_type.format_type == 'message'))

    def is_root(self):
        return self._is_root

    def set_root(self, val):
        self._is_root = bool(val)

    def walk(self, with_self=False, skip_enclosed=False):
        """
        Returns iterator object traversing through the message parts. If the
        top level part needs to be included then set the `with_self` to `True`.
        If the parts of the enclosed messages should not be included then set
        the `skip_enclosed` parameter to `True`.
        """

        if with_self:
            yield self

        if self.content_type.is_multipart():
            for p in self.parts:
                yield p
                for x in p.walk(with_self=False, skip_enclosed=skip_enclosed):
                    yield x

        elif self.content_type.is_message_container() and not skip_enclosed:
            yield self.enclosed
            for p in self.enclosed.walk(with_self=False):
                yield p

    def is_attachment(self):
        return self.content_disposition[0] == 'attachment'

    def is_inline(self):
        return self.content_disposition[0] == 'inline'

    def is_delivery_notification(self):
        """
        Tells whether a message is a system delivery notification.
        """
        content_type = self.content_type
        return (content_type == 'multipart/report'
                and content_type.params.get('report-type') == 'delivery-status')

    def get_attached_message(self):
        """
        Returns attached message if found, `None` otherwise.
        """
        try:
            for part in self.walk(with_self=True):
                if part.content_type == 'message/rfc822':
                    # the first part yielded by the walk is the result
                    for p in part.walk():
                        return p
        except Exception:
            # deliberate best effort: log the failure and report "not found"
            log.exception('Failed to get attached message')
        return None

    def remove_headers(self, *header_names):
        """
        Removes all passed headers name in one operation.
        """
        for header_name in header_names:
            if header_name in self.headers:
                del self.headers[header_name]

    @property
    def bounce(self):
        """
        Deprecated: use bounce.detect(message).
        """
        if not self._bounce:
            self._bounce = bounce.detect(self)
        return self._bounce

    def is_bounce(self, probability=0.3):
        """
        Deprecated: use bounce.detect(message).
        """
        return self.bounce.is_bounce(probability)

    def __str__(self):
        return '({0})'.format(self.content_type)


class MimePart(RichPartMixin):
    """
    A message part backed by a container (`Stream`, `Body` or `Part`) with
    optional child `parts` (multipart) or an `enclosed` message.
    """

    def __init__(self, container, parts=None, enclosed=None, is_root=False):
        RichPartMixin.__init__(self, is_root)
        self._container = container
        self.parts = parts or []
        self.enclosed = enclosed

    @property
    def size(self):
        """ Returns message size in bytes"""
        if self.is_root() and not self.was_changed():
            if isinstance(self._container, Stream):
                return self._container.size
            else:
                return sum(part._container.size
                           for part in self.walk(with_self=True))
        else:
            # serialize into a counter that only tracks the written length
            with closing(_CounterIO()) as out:
                self.to_stream(out)
                return out.getvalue()

    @property
    def headers(self):
        """Returns multi dictionary with headers converted to unicode,
        headers like Content-Type, Content-Disposition are tuples
        ('value', {'param': 'val'})"""
        return self._container.headers

    @property
    def content_type(self):
        """ returns object with properties:
        main - main part of content type
        sub - subpart of content type
        params - dictionary with parameters
        """
        return self._container.content_type

    @property
    def content_disposition(self):
        """ returns tuple (value, params) """
        return self.headers.get('Content-Disposition', WithParams(None))

    @property
    def content_encoding(self):
        return self.headers.get(
            'Content-Transfer-Encoding', WithParams('7bit'))

    @content_encoding.setter
    def content_encoding(self, value):
        self.headers['Content-Transfer-Encoding'] = value

    @property
    def body(self):
        """ returns decoded body (implicitly `None` for other part types) """
        if self.content_type.is_singlepart()\
                or self.content_type.is_delivery_status():
            return self._container.body

    @body.setter
    def body(self, value):
        # silently ignored for parts that cannot carry a body of their own
        if self.content_type.is_singlepart()\
                or self.content_type.is_delivery_status():
            self._container.body = value

    @property
    def charset(self):
        return self.content_type.get_charset()

    @charset.setter
    def charset(self, value):
        charset = value.lower()
        self.content_type.set_charset(value)
        if 'Content-Type' not in self.headers:
            self.headers['Content-Type'] = ContentType('text', 'plain', {})
        self.headers['Content-Type'].params['charset'] = charset
        self.headers.changed = True

    def to_string(self):
        """
        Returns a MIME representation of the message.
        """
        # this optimisation matters *A LOT*
        # when there are no prepended headers
        # we submit the original string,
        # no copying, no alternation, yeah!
        if self.is_root() and not self.was_changed(ignore_prepends=True):
            with closing(StringIO()) as out:
                self._container._stream_prepended_headers(out)
                return out.getvalue() + self._container.string
        else:
            with closing(StringIO()) as out:
                self.to_stream(out)
                return out.getvalue()

    def to_stream(self, out):
        """
        Serializes the message using a file like object.

        An unchanged part is copied verbatim after its prepended headers. If
        re-encoding a changed part fails with `DecodingError`, the output is
        rewound and the original raw part is written instead.
        """
        if not self.was_changed(ignore_prepends=True):
            self._container._stream_prepended_headers(out)
            out.write(self._container.read_message())
        else:
            # remember where this part starts so a failed attempt can be
            # overwritten with the original content
            original_position = out.tell()
            try:
                self._to_stream_when_changed(out)
            except DecodingError:
                out.seek(original_position)
                out.write(self._container.read_message())

    def was_changed(self, ignore_prepends=False):
        """
        Tells whether the headers or the body of the part, or any of its sub
        parts, were changed. `ignore_prepends` is applied to the headers of
        this part only.
        """
        if self._container.headers_changed(ignore_prepends):
            return True

        if self.content_type.is_singlepart():
            return bool(self._container.body_changed())

        elif self.content_type.is_multipart():
            return any(p.was_changed() for p in self.parts)

        elif self.content_type.is_message_container():
            return self.enclosed.was_changed()

        return False

    def to_python_message(self):
        return _email.message_from_string(self.to_string())

    def append(self, *messages):
        """Adds the messages as sub parts and clears their root flag."""
        for m in messages:
            self.parts.append(m)
            m.set_root(False)

    def enclose(self, message):
        """Sets the enclosed message and clears its root flag."""
        self.enclosed = message
        message.set_root(False)

    def _to_stream_when_changed(self, out):

        ctype = self.content_type

        if ctype.is_singlepart():

            if self._container.body_changed():
                # re-encode first: this may update the charset and the
                # transfer encoding headers that are written below
                charset, encoding, body = _encode_body(self)
                if charset:
                    self.charset = charset
                self.content_encoding = WithParams(encoding)
            else:
                body = self._container.read_body()

            # RFC allows subparts without headers
            if self.headers:
                self.headers.to_stream(out)
            elif self.is_root():
                raise EncodingError('Root message should have headers')

            out.write(_CRLF)
            out.write(body)
        else:
            self.headers.to_stream(out)
            out.write(_CRLF)

            if ctype.is_multipart():
                boundary = ctype.get_boundary_line()
                for index, part in enumerate(self.parts):
                    out.write(
                        (_CRLF if index != 0 else '') + boundary + _CRLF)
                    part.to_stream(out)
                out.write(_CRLF + ctype.get_boundary_line(final=True) + _CRLF)

            elif ctype.is_message_container():
                self.enclosed.to_stream(out)


def _decode_body(content_type, content_encoding, body):
    """Undoes the transfer encoding, then the charset of a raw body."""
    # decode the transfer encoding
    body = _decode_transfer_encoding(content_encoding, body)

    # decode the charset next
    return _decode_charset(content_type, body)


def _decode_transfer_encoding(encoding, body):
    """Decodes base64 / quoted-printable; other encodings pass through."""
    if encoding == 'base64':
        return _base64_decode(body)
    elif encoding == 'quoted-printable':
        return quopri.decodestring(body)
    else:
        return body


def _decode_charset(ctype, body):
    """Converts text bodies to unicode; non-text bodies pass through."""
    if ctype.main != 'text':
        return body

    charset = ctype.get_charset()
    body = charsets.convert_to_unicode(charset, body)

    # For text/html utf-8 bodies replace the non-breaking space (0xA0)
    # with a regular space: Outlook is reported to have a bug there.
    if ctype.sub == 'html' and charset == 'utf-8':
        # Outlook bug
        body = body.replace(u'\xa0', u' ')

    return body


def _encode_body(part):
    """
    Encodes the body of the part for serialization.

    Returns a `(charset, content_encoding, body)` tuple. Text parts that are
    not attachments keep or strengthen their transfer encoding; everything
    else is base64 encoded.
    """
    content_type = part.content_type
    content_encoding = part.content_encoding.value
    body = part._container.body

    charset = content_type.get_charset()
    if content_type.main == 'text':
        charset, body = _encode_charset(charset, body)
        if not part.is_attachment():
            content_encoding = _choose_text_encoding(charset, content_encoding,
                                                     body)
            # report which text encoding is chosen
            metrics.incr('encoding.' + content_encoding)
        else:
            content_encoding = 'base64'
    else:
        content_encoding = 'base64'

    body = _encode_transfer_encoding(content_encoding, body)
    return charset, content_encoding, body


def _encode_charset(preferred_charset, text):
    """
    Encodes text with the preferred charset ('ascii' when none is given),
    falling back to utf-8 when that is not possible.

    Returns a `(charset, encoded_text)` tuple naming the charset really used.
    """
    charset = preferred_charset or 'ascii'
    try:
        text = text.encode(charset)
    except (UnicodeError, LookupError, TypeError):
        # text not representable in the charset, or charset unknown/invalid
        charset = 'utf-8'
        text = text.encode(charset)
    return charset, text


def _encode_transfer_encoding(encoding, body):
    """
    Applies the transfer encoding to the body.

    On Python 3 the result is always returned as text, byte results are
    decoded as utf-8.
    """
    if six.PY3:
        if encoding == 'quoted-printable':
            body = quopri.encodestring(body, quotetabs=False)
            body = fix_leading_dot(body)
            return body.decode('utf-8')

        if encoding == 'base64':
            if isinstance(body, six.text_type):
                body = body.encode('utf-8')

            body = _email.encode_base64(body)
            return body.decode('utf-8')

        if isinstance(body, six.binary_type):
            return body.decode('utf-8')

        return body

    if encoding == 'quoted-printable':
        body = quopri.encodestring(body, quotetabs=False)
        return fix_leading_dot(body)
    elif encoding == 'base64':
        return _email.encode_base64(body)
    else:
        return body


def fix_leading_dot(s):
    """
    From SMTP RFC: https://tools.ietf.org/html/rfc5321#section-4.5.2

    -----
    When a line of mail text is received by the SMTP server, it checks
    the line.  If the line is composed of a single period, it is
    treated as the end of mail indicator.  If the first character is a
    period and there are other characters on the line, the first
    character is deleted.
    -----

    We have observed some remote SMTP servers have an intermittent obscure bug
    where the leading '.' is removed according to the above spec. Even when the '.'
    is obviously within the bounds of a mime part, and with our sending SMTP
    clients dot stuffing the line. To combat this we convert any leading '.'
    to a '=2E'.

    Takes and returns a byte string of quoted-printable encoded data.
    """
    infp = six.BytesIO(s)
    outfp = six.BytesIO()

    # TODO(thrawn01): We could scan the entire string looking for leading '.'
    #  If none found return the original string. This would save memory at the
    #  expense of some additional processing

    while 1:
        line = infp.readline()
        if not line:
            break

        # startswith compares bytes to bytes on both Python 2 and 3
        if line.startswith(b'.'):
            line = _quote_and_cut(line)

        outfp.write(line)

    return outfp.getvalue()


def _quote_and_cut(ln):
    """
    Quotes the leading '.', if the resulting line is longer than 76 characters
    cut the line in half without dividing any quoted characters and
    conforming to the quoted-printable RFC in regards to ending characters.
    """
    ln = quopri.quote(ln[0:1]) + ln[1:]

    # If the line is under the 76 + '\n' character limit
    if len(ln) <= 77:
        return ln

    # Find a suitable cut point that doesn't divide a quoted character
    in_quote, pos = 0, -1
    for pos, c in enumerate(ln):

        # Skip quoted (=XX) characters
        if in_quote != 0:
            in_quote += 1
            if in_quote <= 3:
                continue
            in_quote = 0

        # If we are past the half way mark, make our cut here
        if pos > len(ln)/2:
            break

        # iterating bytes yields ints on Python 3
        if six.PY3:
            c = bytes((c,))

        # Should be a quoted character
        if c == b'=':
            # Peek ahead, does the next char appear to be a hex value?
            if quopri.ishex(ln[pos+1:pos+2]):
                in_quote = 1
            continue

    new_line = ln[:pos]
    next_line = ln[pos:]

    # If new line ends with a :space or :tab
    if new_line[-1:] in b' \t':
        new_line = new_line[:-1] + quopri.quote(new_line[-1:])

    # If the next line starts with a '.'
    if next_line.startswith(b'.'):
        next_line = quopri.quote(next_line[0:1]) + next_line[1:]

    # '=' at the end of a line is a quoted-printable soft line break
    return new_line + b"=\n" + next_line


def _choose_text_encoding(charset, preferred_encoding, body):
    """
    Picks the transfer encoding for a text body.

    The preferred encoding is kept for ascii/iso-8859-1 bodies without long
    lines; otherwise it is raised to at least quoted-printable.
    """
    if charset in ('ascii', 'iso-8859-1', 'us-ascii'):
        if has_long_lines(body):
            return _stronger_encoding(preferred_encoding, 'quoted-printable')
        else:
            return preferred_encoding
    else:
        encoding = _stronger_encoding(preferred_encoding, 'quoted-printable')
        return encoding


def _stronger_encoding(a, b):
    """
    Returns `a` when its weight is at least the weight of `b`, `b` otherwise.

    An unknown `a` has the lowest weight. `b` must be a known encoding.
    """
    weights = {'7bit': 0, 'quoted-printable': 1, 'base64': 1, '8bit': 3}
    if weights.get(a, -1) >= weights[b]:
        return a
    return b


def has_long_lines(text, max_line_len=599):
    """
    Returns True if text contains a line of `max_line_len` characters or more.
    Some SMTP servers (Exchange) refuse to accept messages 'wider' than
    certain length.
    """
    if not text:
        return False
    return any(len(line) >= max_line_len for line in text.splitlines())


def _base64_decode(s):
    """Recover base64 if it is broken."""
    try:
        return base64.b64decode(s)

    except (TypeError, ValueError):
        # drop everything outside of the base64 alphabet (padding included)
        s = _recover_base64(s)
        tail_size = len(s) & 3
        if tail_size == 1:
            # crop last character as adding padding does not help
            return base64.b64decode(s[:-1])

        # add only the padding needed to complete the last quad
        return base64.b64decode(s + '=' * (-len(s) % 4))


class _CounterIO(object):
    """Minimal write-only file like object that only tracks the length."""

    def __init__(self):
        self.length = 0

    def tell(self):
        return self.length

    def write(self, s):
        self.length += len(s)

    def seek(self, p):
        self.length = p

    def getvalue(self):
        return self.length

    def close(self):
        pass


_CRLF = '\r\n'


# To recover base64 we need to translate the part to the base64 alphabet.
_b64_alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
_b64_invalid_chars = ''.join(
    chr(code) for code in range(256) if chr(code) not in _b64_alphabet)


def _recover_base64(s):
    """Returns `s` with all characters outside the base64 alphabet removed."""
    if six.PY2:
        return s.translate(None, _b64_invalid_chars)

    buf = StringIO()
    chunk_start = 0
    for i, c in enumerate(s):
        if (('A' <= c <= 'Z') or
                ('a' <= c <= 'z') or
                ('0' <= c <= '9') or
                c == '+' or c == '/'):
            continue

        # invalid character: flush the valid run collected so far
        buf.write(s[chunk_start:i])
        chunk_start = i + 1

    buf.write(s[chunk_start:len(s)])
    return buf.getvalue()