1#
2#   Cython -- encoding related tools
3#
4
5from __future__ import absolute_import
6
7import re
8import sys
9
# Select the text type, native string type, byte string type and the
# "code point -> character" function that match the running interpreter.
IS_PYTHON3 = sys.version_info[0] >= 3

if IS_PYTHON3:
    _unicode = str
    _str = str
    _bytes = bytes
    _unichr = chr
else:
    _unicode = unicode
    _str = str
    _bytes = str
    _unichr = unichr

# Shared empty values of each string type.
empty_unicode = _unicode()
empty_bytes = _bytes()

# Concatenates an iterable of byte strings without a separator.
join_bytes = empty_bytes.join
21
22
class UnicodeLiteralBuilder(object):
    """Collect text fragments and join them into one unicode literal.

    The fragments are kept in ``self.chars`` in the order they were added.
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Add a run of characters; byte strings are decoded as ASCII first."""
        text = characters
        if isinstance(text, _bytes):
            # byte strings reach us from Py2 string literals in the parser code
            text = text.decode("ASCII")
        assert isinstance(text, _unicode), str(type(text))
        self.chars.append(text)

    if sys.maxunicode != 65535:
        def append_charval(self, char_number):
            """Add the character with the given code point."""
            self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            """Add the character with the given code point."""
            if char_number <= 65535:
                self.chars.append(_unichr(char_number))
                return
            # wide Unicode character on a narrow platform:
            # store it as a surrogate pair instead
            high, low = divmod(char_number - 0x10000, 1024)
            self.chars.append(_unichr(high + 0xD800))
            self.chars.append(_unichr(low + 0xDC00))

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape by its code point; 'escape_string' is unused."""
        self.append_charval(char_number)

    def getstring(self):
        """Return everything added so far as a single EncodedString."""
        joined = u''.join(self.chars)
        return EncodedString(joined)

    def getstrings(self):
        """Return a (bytes, unicode) pair; the bytes slot is always None."""
        return (None, self.getstring())
58
59
class BytesLiteralBuilder(object):
    """Collect byte fragments and join them into a byte string or char value.

    Text fragments are encoded with 'target_encoding' when they are added.
    """
    def __init__(self, target_encoding):
        self.target_encoding = target_encoding
        self.chars = []

    def append(self, characters):
        """Add a run of characters; text is encoded with the target encoding."""
        data = characters
        if isinstance(data, _unicode):
            data = data.encode(self.target_encoding)
        assert isinstance(data, _bytes), str(type(data))
        self.chars.append(data)

    def append_charval(self, char_number):
        """Add the single character with the given code, encoded as ISO-8859-1."""
        encoded_char = _unichr(char_number).encode('ISO-8859-1')
        self.chars.append(encoded_char)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape as its literal text; 'char_number' is unused."""
        self.append(escape_string)

    def getstring(self):
        # the result has to be a byte string
        joined = join_bytes(self.chars)
        return bytes_literal(joined, self.target_encoding)

    def getchar(self):
        # the result has to be a byte string
        return self.getstring()

    def getstrings(self):
        """Return a (bytes, unicode) pair; the unicode slot is always None."""
        return (self.getstring(), None)
89
90
class StrLiteralBuilder(object):
    """Build the bytes and the unicode form of a string side by side.

    Every call is forwarded to a BytesLiteralBuilder and then to a
    UnicodeLiteralBuilder.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Add the same run of characters to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append(characters)

    def append_charval(self, char_number):
        """Add the same character value to both representations."""
        for builder in (self._bytes, self._unicode):
            builder.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Add a unicode escape.

        The bytes form receives the escape text itself, the unicode form
        receives the character it stands for.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the (bytes, unicode) pair built so far."""
        bytes_value = self._bytes.getstring()
        unicode_value = self._unicode.getstring()
        return (bytes_value, unicode_value)
112
113
class EncodedString(_unicode):
    # Text string that remembers the encoding of its source.
    # 'encoding' stays None for unicode strings; otherwise it holds the
    # source encoding.
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    @property
    def is_unicode(self):
        return self.encoding is None

    def byteencode(self):
        """Encode with the recorded encoding (asserts that one is set)."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode as UTF-8 (asserts that no encoding is recorded)."""
        assert self.encoding is None
        return self.encode("UTF-8")

    def as_utf8_string(self):
        """Return the UTF-8 encoded value wrapped by bytes_literal()."""
        return bytes_literal(self.utf8encode(), 'utf8')

    def as_c_string_literal(self):
        # encode first, then let the byte literal render the C string literal
        if self.encoding is not None:
            encoded = bytes_literal(self.byteencode(), self.encoding)
        else:
            encoded = self.as_utf8_string()
        return encoded.as_c_string_literal()

    def contains_surrogates(self):
        return string_contains_surrogates(self)

    if not hasattr(_unicode, "isascii"):
        # only needed where the base string type lacks isascii()
        # (Python 3.7+ already provides it)
        def isascii(self):
            try:
                self.encode("ascii")
            except UnicodeEncodeError:
                return False
            return True
158
159
def string_contains_surrogates(ustring):
    """
    Tell whether the unicode string holds any code point that is a
    surrogate (0xD800-0xDFFF) or lies above 0xFFFF, i.e. a character
    that a narrow (UTF-16) CPython build would spell as two separate
    code units.  Works on both wide (UCS-4) and narrow builds.
    """
    return any(
        # values above 0xFFFF can only show up on wide platforms
        code > 65535 or 0xD800 <= code <= 0xDFFF
        for code in map(ord, ustring)
    )
173
174
def string_contains_lone_surrogates(ustring):
    """
    Tell whether the unicode string holds a surrogate code point that is
    not part of a well-formed high/low pair.

    On narrow (UTF-16) CPython builds a high surrogate directly followed
    by a low surrogate counts as a pair.  On wide (UCS-4) builds any
    surrogate code point is reported as lone.
    """
    narrow_build = sys.maxunicode == 65535
    pending_high = False  # True right after an unmatched high surrogate
    for code in map(ord, ustring):
        if not (0xD800 <= code <= 0xDFFF):
            # ordinary character (the common case)
            if pending_high:
                return True
            continue
        if not narrow_build:
            # 32 bit Unicode platforms never store pairs
            return True
        is_high = code <= 0xDBFF
        if is_high == pending_high:
            # a second high surrogate in a row, or a low one without a high
            return True
        pending_high = is_high
    return pending_high
201
202
class BytesLiteral(_bytes):
    # Byte string type offering the same helper methods as EncodedString.
    encoding = None
    is_unicode = False

    def __deepcopy__(self, memo):
        # deep copies hand back the very same object
        return self

    def byteencode(self):
        """Return the value as a plain byte string."""
        if not IS_PYTHON3:
            # fake-recode the string to end up with a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')
        return _bytes(self)

    def utf8encode(self):
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the bytes as ISO-8859-1 so that the value can be
        used in % formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    def as_c_string_literal(self):
        """Return the escaped and split value wrapped in double quotes."""
        escaped = escape_byte_string(self)
        return '"%s"' % split_string_literal(escaped)

    if not hasattr(_bytes, "isascii"):
        # Python 3.7+ already defines isascii() on bytes
        def isascii(self):
            return True
236
237
def bytes_literal(s, encoding):
    """Wrap the byte string 's' in a BytesLiteral tagged with 'encoding'."""
    assert isinstance(s, bytes)
    literal = BytesLiteral(s)
    literal.encoding = encoding
    return literal
243
244
def encoded_string(s, encoding):
    """Wrap 's' in an EncodedString.

    The 'encoding' attribute is only set when 'encoding' is not None;
    otherwise the class default is left in place.
    """
    assert isinstance(s, (_unicode, bytes))
    wrapped = EncodedString(s)
    if encoding is None:
        return wrapped
    wrapped.encoding = encoding
    return wrapped
251
def encoded_string_or_bytes_literal(s, encoding):
    """Wrap 's' with bytes_literal() if it is a byte string, else with
    encoded_string().
    """
    wrap = bytes_literal if isinstance(s, bytes) else encoded_string
    return wrap(s, encoding)
257
258
# Maps a two-character escape sequence (backslash + letter) to the character
# it stands for; sequences that are not listed yield None.
char_from_escape_sequence = dict(zip(
    (r'\a', r'\b', r'\f', r'\n', r'\r', r'\t', r'\v'),
    (u'\a', u'\b', u'\f', u'\n', u'\r', u'\t', u'\v'),
)).get

# Sequences that get replaced by an escape when writing C string literals:
# backslash, '??', the double quote and the characters with codes 0..31.
_c_special = ('\\', '??', '"') + tuple([chr(code) for code in range(32)])
270
271
def _to_escape_sequence(s):
    """Return the escape sequence text used to replace the special 's'."""
    if s in '\n\r\t':
        # repr() already spells these as backslash + letter; drop its quotes
        return repr(s)[1:-1]
    if s == '"':
        return r'\"'
    if s == '\\':
        return r'\\'
    # inside a character sequence, octal escapes pass much better than hex
    return ''.join('\\%03o' % ord(ch) for ch in s)
282
283
def _build_specials_replacer():
    """Create the function that escapes all '_c_special' sequences in a
    byte string.

    One regular expression with an alternative per special sequence is
    compiled up front.  Each match is looked up in a table of
    precomputed escape sequences.
    """
    escapes = {}
    alternatives = []
    for special in _c_special:
        # one single-character class per character of the sequence
        char_classes = ['[%s]' % c.replace('\\', '\\\\') for c in special]
        alternatives.append(''.join(char_classes))
        escapes[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')

    pattern_source = '(%s)' % '|'.join(alternatives)
    pattern = re.compile(pattern_source.encode('ASCII'))

    def replace(s):
        return pattern.sub(lambda match: escapes[match.group(1)], s)

    return replace

_replace_specials = _build_specials_replacer()
299
300
def escape_char(c):
    """Return the escaped spelling of the single character 'c'.

    On Python 3 the argument is first decoded from bytes as ISO-8859-1.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        # repr() already gives the backslash spelling; drop its quotes
        return repr(c)[1:-1]
    if c == "'":
        return "\\'"
    code = ord(c)
    if 32 <= code <= 127:
        return c
    # hex works well for characters
    return "\\x%02X" % code
314
def escape_byte_string(s):
    """Escape a byte string for inclusion in C code.

    The result is a Unicode string, not bytes: encoding it as
    ISO-8859-1 yields the byte sequence that should be written.
    """
    s = _replace_specials(s)
    try:
        # trial decoding: if everything is plain ASCII we are done
        return s.decode("ASCII")
    except UnicodeDecodeError:
        pass

    if IS_PYTHON3:
        # iterating bytes yields integers here
        escaped = bytearray()
        for byte in s:
            if byte < 128:
                escaped.append(byte)
            else:
                escaped.extend(('\\%3o' % byte).encode('ASCII'))
        return escaped.decode('ISO-8859-1')

    # iterating a Py2 byte string yields one-character strings
    pieces = [c if ord(c) < 128 else '\\%3o' % ord(c) for c in s]
    return join_bytes(pieces).decode('ISO-8859-1')
345
def split_string_literal(s, limit=2000):
    """Break a long string literal body into chunks joined by '""'.

    MSVC can't handle long string literals.  Strings shorter than
    'limit' are returned unchanged.  A chunk boundary that would fall
    within the last four characters after a backslash is moved back to
    just before that backslash (and before any backslashes directly
    preceding it).
    """
    total = len(s)
    if total < limit:
        return s

    pieces = []
    pos = 0
    while pos < total:
        cut = pos + limit
        tail = s[cut-4:cut]
        if total > cut - 4 and '\\' in tail:
            cut -= 4 - tail.find('\\')  # just before the backslash
            while s[cut-1] == '\\':
                cut -= 1
                if cut == pos:
                    # must have been a long line of backslashes
                    cut = pos + limit - (limit % 2) - 4
                    break
        pieces.append(s[pos:cut])
        pos = cut
    return '""'.join(pieces)
366
def encode_pyunicode_string(s):
    """Render a unicode string as comma separated Py_UNICODE[] values.

    Returns a (utf16, utf32) pair of strings listing the code units /
    code points in decimal, each with a terminating 0.  The utf16 part
    is empty when both listings would be identical.
    """
    values = [ord(ch) for ch in s]
    values.append(0)

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf32 = values
        utf16 = []
        for code_point in values:
            if code_point < 0x10000:
                utf16.append(code_point)
            else:
                # outside of the BMP: split into a surrogate pair
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.extend((high + 0xD800, low + 0xDC00))
    else:
        utf16 = values
        utf32 = []
        for code_unit in values:
            completes_pair = (
                0xDC00 <= code_unit <= 0xDFFF
                and utf32
                and 0xD800 <= utf32[-1] <= 0xDBFF
            )
            if completes_pair:
                # merge the low surrogate into the high one stored before it
                utf32[-1] = ((utf32[-1] & 0x3FF) << 10) + (code_unit & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    utf16_text = ",".join(map(_unicode, utf16))
    utf32_text = ",".join(map(_unicode, utf32))
    return utf16_text, utf32_text
393