1#
2#   Cython -- encoding related tools
3#
4
5from __future__ import absolute_import
6
7import re
8import sys
9
# Select the text type, native str type, byte-string type and the
# "code point -> character" function matching the running interpreter.
IS_PYTHON3 = sys.version_info[0] >= 3

if IS_PYTHON3:
    _unicode = str
    _str = str
    _bytes = bytes
    _unichr = chr
else:
    _unicode = unicode
    _str = str
    _bytes = str
    _unichr = unichr

# Empty instances of the byte string type and the text type.
empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound ``join`` of the empty byte string: concatenates byte strings.
join_bytes = empty_bytes.join
21
22
class UnicodeLiteralBuilder(object):
    """Collect text fragments and join them into one unicode string.
    """
    def __init__(self):
        # unicode fragments in the order they were appended
        self.chars = []

    def append(self, characters):
        """Add a fragment; byte strings are decoded as ASCII first."""
        text = characters
        if isinstance(text, _bytes):
            # this came from a Py2 string literal in the parser code
            text = text.decode("ASCII")
        assert isinstance(text, _unicode), str(type(text))
        self.chars.append(text)

    # The implementation of append_charval() is chosen once, when the
    # class body executes, based on the interpreter's sys.maxunicode.
    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Add the character for code point 'char_number'."""
            if char_number <= 65535:
                self.chars.append(_unichr(char_number))
                return
            # wide Unicode character on narrow platform => replace
            # by surrogate pair
            high, low = divmod(char_number - 0x10000, 1024)
            self.chars.append(_unichr(high + 0xD800))
            self.chars.append(_unichr(low + 0xDC00))
    else:
        def append_charval(self, char_number):
            """Add the character for code point 'char_number'."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Add the escaped character by value; 'escape_string' is unused."""
        self.append_charval(char_number)

    def getstring(self):
        """Return everything collected so far as an EncodedString."""
        joined = u''.join(self.chars)
        return EncodedString(joined)

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes slot is always None."""
        return (None, self.getstring())
58
59
class BytesLiteralBuilder(object):
    """Collect byte fragments and join them into a byte string or
    char value.
    """
    def __init__(self, target_encoding):
        # encoding applied to unicode fragments passed to append()
        self.target_encoding = target_encoding
        # byte string fragments in the order they were appended
        self.chars = []

    def append(self, characters):
        """Add a fragment; unicode input is encoded to the target encoding."""
        data = characters
        if isinstance(data, _unicode):
            data = data.encode(self.target_encoding)
        assert isinstance(data, _bytes), str(type(data))
        self.chars.append(data)

    def append_charval(self, char_number):
        """Add the single byte obtained by encoding the character for
        'char_number' as ISO-8859-1.
        """
        encoded = _unichr(char_number).encode('ISO-8859-1')
        self.chars.append(encoded)

    def append_uescape(self, char_number, escape_string):
        """Add the escape sequence text itself; 'char_number' is unused."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a bytes literal tagged with the
        target encoding.
        """
        # this *must* return a byte string!
        joined = join_bytes(self.chars)
        return bytes_literal(joined, self.target_encoding)

    def getchar(self):
        """Return the same value as getstring()."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode slot is always None."""
        return (self.getstring(), None)
89
90
class StrLiteralBuilder(object):
    """Build a bytes and a unicode representation of the same string
    side by side.
    """
    def __init__(self, target_encoding):
        self._unicode = UnicodeLiteralBuilder()
        self._bytes = BytesLiteralBuilder(target_encoding)

    def append(self, characters):
        """Add the fragment to both builders (bytes first)."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Add the character value to both builders (bytes first)."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Add the escape text to the bytes builder and the character
        value to the unicode builder.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the pair (bytes string, unicode string)."""
        byte_value = self._bytes.getstring()
        text_value = self._unicode.getstring()
        return (byte_value, text_value)
112
113
class EncodedString(_unicode):
    # Unicode string subclass that remembers the original encoding.
    # 'encoding' stays None for unicode strings; otherwise it holds
    # the source encoding.
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share this very object instead of duplicating it
        return self

    def byteencode(self):
        """Encode with the stored source encoding (which must be set)."""
        source_encoding = self.encoding
        assert source_encoding is not None
        return self.encode(source_encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid while no source encoding is set."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        # True exactly when no source encoding was recorded
        return self.encoding is None

    def contains_surrogates(self):
        """Report the result of string_contains_surrogates() for this string."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded form as a bytes literal tagged 'utf8'."""
        utf8_bytes = self.utf8encode()
        return bytes_literal(utf8_bytes, 'utf8')
140
141
def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds surrogate code points on a
    CPython platform with wide (UCS-4) or narrow (UTF-16) Unicode,
    i.e. characters that would be spelled as two separate code units
    on a narrow platform.

    Returns True if any code point lies above 0xFFFF (which can only
    happen on wide platforms) or inside the range 0xD800-0xDFFF,
    False otherwise (including for the empty string).
    """
    return any(
        code > 65535 or 0xD800 <= code <= 0xDFFF
        for code in map(ord, ustring)
    )
155
156
class BytesLiteral(_bytes):
    # Bytes subclass offering the same interface as EncodedString.
    encoding = None
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies share this very object instead of duplicating it
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if not IS_PYTHON3:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        """Always fails the assertion: this is not a unicode string."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    def as_c_string_literal(self):
        """Return the escaped and split content wrapped in double quotes."""
        escaped = escape_byte_string(self)
        return '"%s"' % split_string_literal(escaped)
185
186
def bytes_literal(s, encoding):
    """Wrap the byte string 's' in a BytesLiteral and record 'encoding'
    on the new object.
    """
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal
192
193
def encoded_string(s, encoding):
    """Wrap 's' in an EncodedString.

    The 'encoding' attribute is only set on the result when 'encoding'
    is not None; otherwise the class default (None) stays in effect.
    """
    assert isinstance(s, (_unicode, bytes))
    result = EncodedString(s)
    if encoding is None:
        return result
    result.encoding = encoding
    return result
200
201
# Lookup function mapping a two-character backslash escape sequence to
# the character it denotes.  This is the mapping's bound ``get``, so an
# unknown sequence yields None (or the supplied default).
char_from_escape_sequence = dict((
    (r'\a', u'\a'),
    (r'\b', u'\b'),
    (r'\f', u'\f'),
    (r'\n', u'\n'),
    (r'\r', u'\r'),
    (r'\t', u'\t'),
    (r'\v', u'\v'),
)).get
211
# Sequences that get replaced by escape sequences: the backslash, a
# double question mark (presumably to avoid C trigraphs), the double
# quote, and the 32 characters with code points 0-31.
_c_special = ('\\', '??', '"') + tuple(chr(code) for code in range(32))
213
214
def _to_escape_sequence(s):
    """Return the backslash escape sequence used in place of 's'.

    A double quote and a backslash get a backslash prefix, newline,
    carriage return and tab use their repr() spelling, and anything
    else is spelled as one three-digit octal escape per character.
    """
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    if s in '\n\r\t':
        return repr(s)[1:-1]
    # within a character sequence, oct passes much better than hex
    return ''.join('\\%03o' % ord(char) for char in s)
225
226
def _build_specials_replacer():
    """Build and return a function that escapes every entry of
    '_c_special' found in a byte string.

    The returned function takes a byte string and returns it with each
    match replaced by the ASCII-encoded result of _to_escape_sequence().
    """
    patterns = []
    escaped_by_special = {}
    for special in _c_special:
        # one single-character class per character, with a backslash
        # doubled so that it stays literal inside the class
        char_classes = ['[%s]' % char.replace('\\', '\\\\') for char in special]
        patterns.append(''.join(char_classes))
        key = special.encode('ASCII')
        escaped_by_special[key] = _to_escape_sequence(special).encode('ASCII')

    alternation = '(%s)' % '|'.join(patterns)
    substitute = re.compile(alternation.encode('ASCII')).sub

    def lookup_escape(match):
        return escaped_by_special[match.group(1)]

    def replace(s):
        return substitute(lookup_escape, s)

    return replace
240
# Built once at import time: a function that takes a byte string and
# returns it with every '_c_special' sequence replaced by its escape
# sequence.
_replace_specials = _build_specials_replacer()
242
243
def escape_char(c):
    """Return the escaped spelling of the single character 'c'.

    On Python 3 'c' is first decoded from ISO-8859-1.  Newline,
    carriage return, tab and backslash use their repr() spelling, a
    single quote gets a backslash prefix, code points below 32 or
    above 127 become a two-digit hex escape, and everything else is
    returned unchanged.
    """
    char = c.decode('ISO-8859-1') if IS_PYTHON3 else c
    if char == "'":
        return "\\'"
    if char in '\n\r\t\\':
        return repr(char)[1:-1]
    code = ord(char)
    if 32 <= code <= 127:
        return char
    # hex works well for characters
    return "\\x%02X" % code
257
def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: plain ASCII => done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    # Non-ASCII content: spell every byte >= 128 as an octal escape.
    if not IS_PYTHON3:
        # Py2: iterating a byte string yields 1-character strings
        pieces = []
        for char in s:
            value = ord(char)
            pieces.append('\\%3o' % value if value >= 128 else char)
        return join_bytes(pieces).decode('ISO-8859-1')

    # Py3: iterating a byte string yields integers
    escaped = bytearray()
    for byte in s:
        if byte < 128:
            escaped.append(byte)
        else:
            escaped.extend(('\\%3o' % byte).encode('ASCII'))
    return escaped.decode('ISO-8859-1')
288
def split_string_literal(s, limit=2000):
    """Cut 's' into chunks of at most 'limit' characters joined by '""'.

    Strings shorter than 'limit' are returned unchanged.  A cut is
    moved backwards so that it never lands in the four characters
    after a backslash or directly after a run of backslashes.
    """
    # MSVC can't handle long string literals.
    total = len(s)
    if total < limit:
        return s

    pieces = []
    pos = 0
    while pos < total:
        cut = pos + limit
        tail = s[cut - 4:cut]
        if total > cut - 4 and '\\' in tail:
            # move the cut to just before the backslash
            cut -= 4 - tail.find('\\')
            while s[cut - 1] == '\\':
                cut -= 1
                if cut == pos:
                    # must have been a long line of backslashes
                    cut = pos + limit - (limit % 2) - 4
                    break
        pieces.append(s[pos:cut])
        pos = cut
    return '""'.join(pieces)
309
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair of comma-separated decimal number strings, the
    UTF-16 code units and the UTF-32 code points, each terminated by
    a 0.  The UTF-16 string is empty when both sequences are equal.
    """
    values = [ord(char) for char in s]
    values.append(0)

    if sys.maxunicode < 0x10000:
        # Narrow build: the input consists of UTF-16 code units, so
        # merge each high/low surrogate pair into one code point.
        utf16, utf32 = values, []
        for unit in values:
            is_low_surrogate = 0xDC00 <= unit <= 0xDFFF
            if is_low_surrogate and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                high = utf32[-1]
                utf32[-1] = ((high & 0x3FF) << 10) + (unit & 0x3FF) + 0x10000
            else:
                utf32.append(unit)
    else:
        # Wide build or Py3.3: the input consists of code points, so
        # split everything outside of the BMP into a surrogate pair.
        utf16, utf32 = [], values
        for code_point in values:
            if code_point < 0x10000:
                utf16.append(code_point)
            else:
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.extend((high + 0xD800, low + 0xDC00))

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))
336