1from __future__ import unicode_literals
2from __future__ import division
3from __future__ import absolute_import
4from future.builtins import str
5from future.builtins import next
6
7# Copyright (C) 2001-2007 Python Software Foundation
8# Author: Ben Gertzfield, Barry Warsaw
9# Contact: email-sig@python.org
10
# Names exported by ``from <this module> import *``.  The encoding flags
# (QP, BASE64, SHORTEST) and the registry dicts are deliberately not listed.
__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]
17
18from functools import partial
19
20from future.backports import email
21from future.backports.email import errors
22from future.backports.email.encoders import encode_7or8bit
23
24
# Selectors for the transfer encoding applied to headers and bodies.
QP = 1        # quoted-printable
BASE64 = 2    # base64
SHORTEST = 3  # whichever of QP or base64 is shorter; valid for headers only

# Length of the RFC 2047 "chrome" around an encoded word, excluding the
# charset name itself: in "=?charset?q?hello_world?=" that is the "=?",
# "?q?" and "?=" delimiters.
RFC2047_CHROME_LEN = len('=?') + len('?q?') + len('?=')

DEFAULT_CHARSET = 'us-ascii'    # charset used when none is specified
UNKNOWN8BIT = 'unknown-8bit'    # charset name for surrogate-escaped text
EMPTYSTRING = ''                # separator for joining character lists
36
37
# Defaults.
#
# Registry mapping a canonical, lower-case input charset name to a 3-tuple
# of (header encoding, body encoding, output charset).  Each encoding is one
# of QP, BASE64, SHORTEST (headers only) or None for "no encoding".  An
# output charset of None means the text is emitted in the input charset.
# Charsets missing from this table get (SHORTEST, BASE64, None) when a
# Charset instance is created.
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'iso-8859-16': (QP,        QP,      None),
    'windows-1252':(QP,        QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
    }
67
# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.  Keys are lower-case because
# Charset lower-cases a name before looking it up here.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10':'iso-8859-16',
    'latin-10':'iso-8859-16',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }
96
97
# Map charsets to their Unicode codec strings.  A charset that is not
# listed here is assumed to have a Python codec of the same name; a value
# of None means "no codec", i.e. no conversion to or from Unicode.
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }
107
108
109# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register (or replace) the email properties of a character set.

    charset is the canonical name of the input character set.

    header_enc and body_enc say how message headers and message bodies in
    that charset are to be encoded.  Each may be Charset.QP
    (quoted-printable), Charset.BASE64 (base64) or None (no encoding, the
    default).  header_enc may additionally be Charset.SHORTEST, meaning the
    shorter of the quoted-printable and base64 forms; passing SHORTEST as
    body_enc raises ValueError.

    output_charset is the character set that output should be produced in.
    The default, None, means output stays in the input character set.

    Both the input and the output charset need a Unicode codec: either a
    Python codec of the same name, or one registered with
    add_codec(charset, codecname).  See the codecs module's documentation
    for more information.

    The registry is consulted when a Charset is constructed, so instances
    created before this call keep their old properties.
    """
    properties = (header_enc, body_enc, output_charset)
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = properties
136
137
def add_alias(alias, canonical):
    """Register an alternative name for a character set.

    alias is the alternative name, e.g. latin-1.
    canonical is the canonical charset name it resolves to, e.g. iso-8859-1.

    An existing entry for the same alias is replaced.  Charset lower-cases
    a name before resolving aliases, so alias should be given in lower case
    to be found.
    """
    ALIASES.update({alias: canonical})
145
146
def add_codec(charset, codecname):
    """Register the codec that maps a charset's characters to/from Unicode.

    charset is the canonical name of a character set.  codecname is the
    name of a Python codec, i.e. a value that is acceptable as the encoding
    argument when encoding a Unicode string or decoding bytes.

    An existing entry for the same charset is replaced.  The mapping is
    read when a Charset is constructed, so existing instances are not
    affected.
    """
    CODEC_MAP.update({charset: codecname})
155
156
# Convenience function for encoding strings, taking into account
# that they might be unknown-8bit (i.e. contain surrogate-escaped bytes).
def _encode(string, codec):
    """Return string encoded to bytes with codec.

    string is first coerced with str().  The pseudo-codec named by
    UNKNOWN8BIT is handled specially: the text is encoded as ASCII with the
    'surrogateescape' error handler, which turns surrogate-escaped code
    points back into the raw bytes they stand for.
    """
    text = str(string)
    if codec != UNKNOWN8BIT:
        return text.encode(codec)
    return text.encode('ascii', 'surrogateescape')
165
166
class Charset(object):
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it is
                    set to the same value as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is
                  necessary, this attribute will be None.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Look up the email properties of input_charset.

        input_charset may be a text string or an ASCII-decodable bytes-like
        value; it is lower-cased and resolved through ALIASES.

        Raises errors.CharsetError if the name is not pure ASCII.
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  If the argument
        # is already a unicode, we leave it at that, but ensure that the
        # charset name is ASCII-only.
        try:
            if isinstance(input_charset, str):
                input_charset.encode('ascii')
            else:
                input_charset = str(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Take the encodings and conversion target from the CHARSETS
        # registry; unregistered charsets get SHORTEST headers, BASE64
        # bodies and no conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No conversion requested: output in the input charset.
            conv = self.input_charset
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't registered for a charset, guess
        # that a Python codec with the same name as the charset exists.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        """Return the lower-cased input charset name."""
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        """Compare case-insensitively against str(other)."""
        return str(self) == str(other).lower()

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None (or otherwise
        false), otherwise it is self.input_charset.
        """
        return self.output_charset or self.input_charset

    def header_encode(self, string):
        """Header-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        this charset's `header_encoding`.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :return: The encoded string, with RFC 2047 chrome, or `string`
            itself when this charset needs no header encoding.
        """
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            return string
        return encoder_module.header_encode(header_bytes, codec)

    def header_encode_lines(self, string, maxlengths):
        """Header-encode a string by converting it first to bytes.

        This is similar to `header_encode()` except that the string is fit
        into maximum line lengths as given by the argument.  It must only be
        used when this charset has a header encoding: with `header_encoding`
        set to None there is no encoder module to split with and the call
        fails with AttributeError.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :param maxlengths: Maximum line length iterator.  Each element
            returned from this iterator will provide the next maximum line
            length.  This parameter is used as an argument to built-in next()
            and should never be exhausted.  The maximum line lengths should
            not count the RFC 2047 chrome.  These line lengths are only a
            hint; the splitter does the best it can.
        :return: Lines of encoded strings, each with RFC 2047 chrome.  The
            first element is None when not even one character fits on the
            first line.
        """
        # See which encoding we should use.
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        encoder_module = self._get_encoder(header_bytes)
        encoder = partial(encoder_module.header_encode, charset=codec)
        header_length = encoder_module.header_length

        def encode_line(characters):
            # Turn a list of characters into one complete encoded word.
            return encoder(_encode(EMPTYSTRING.join(characters), codec))

        # Calculate the number of characters that the RFC 2047 chrome will
        # contribute to each line.
        charset = self.get_output_charset()
        extra = len(charset) + RFC2047_CHROME_LEN
        # Now comes the hard part.  We must encode bytes but we can't split on
        # bytes because some character sets are variable length and each
        # encoded word must stand on its own.  So the problem is you have to
        # encode to bytes to figure out this word's length, but you must split
        # on characters.  This causes two problems: first, we don't know how
        # many octets a specific substring of unicode characters will get
        # encoded to, and second, we don't know how many ASCII characters
        # those octets will get encoded to.  Unless we try it.  Which seems
        # inefficient.  In the interest of being correct rather than fast (and
        # in the hope that there will be few encoded headers in any such
        # message), brute force it. :(
        lines = []
        current_line = []
        maxlen = next(maxlengths) - extra
        for character in string:
            current_line.append(character)
            this_line = EMPTYSTRING.join(current_line)
            # Measure with the same codec that is used to emit the line, so
            # the measured bytes are exactly the bytes that get encoded.
            length = header_length(_encode(this_line, codec))
            if length > maxlen:
                # This last character doesn't fit so pop it off.
                current_line.pop()
                # Does nothing fit on the first line?
                if not lines and not current_line:
                    lines.append(None)
                else:
                    lines.append(encode_line(current_line))
                current_line = [character]
                maxlen = next(maxlengths) - extra
        # Whatever is left over forms the final line.
        lines.append(encode_line(current_line))
        return lines

    def _get_encoder(self, header_bytes):
        """Return the module that header-encodes for this charset, or None.

        For SHORTEST the choice depends on header_bytes: base64 is picked
        only when it is strictly shorter than quoted-printable.  None is
        returned when header_encoding is not one of the known flags.
        """
        if self.header_encoding == BASE64:
            return email.base64mime
        elif self.header_encoding == QP:
            return email.quoprimime
        elif self.header_encoding == SHORTEST:
            len64 = email.base64mime.header_length(header_bytes)
            lenqp = email.quoprimime.header_length(header_bytes)
            if len64 < lenqp:
                return email.base64mime
            else:
                return email.quoprimime
        else:
            return None

    def body_encode(self, string):
        """Body-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.  If body_encoding is None, we assume the
        output charset is a 7bit encoding, so re-encoding the decoded
        string using the ascii codec produces the correct string version
        of the content.

        An empty (false) string is returned unchanged.
        """
        if not string:
            return string
        # Compare the flags by value, as get_body_encoding() and
        # _get_encoder() do; identity comparison on integers is unreliable.
        if self.body_encoding == BASE64:
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            return email.base64mime.body_encode(string)
        elif self.body_encoding == QP:
            # quoprimime.body_encode takes a string, but operates on it as if
            # it were a list of byte codes.  For a (minimal) history on why
            # this is so, see changeset 0cf700464177.  To correctly encode a
            # character set, then, we must turn it into pseudo bytes via the
            # latin1 charset, which will encode any byte as a single code point
            # between 0 and 255, which is what body_encode is expecting.
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            string = string.decode('latin1')
            return email.quoprimime.body_encode(string)
        else:
            if isinstance(string, str):
                string = string.encode(self.output_charset).decode('ascii')
            return string
410