#
# Cython -- encoding related tools
#

from __future__ import absolute_import

import re
import sys

# Version shim: pick the text/bytes types and the code-point-to-character
# function once, so that the rest of the module is version agnostic.
if sys.version_info[0] >= 3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
    IS_PYTHON3 = True
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
    IS_PYTHON3 = False

empty_bytes = _bytes()
empty_unicode = _unicode()

# Bound ``join`` of the empty byte string: concatenates byte string chunks.
join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Assemble a unicode string.

    Chunks are collected in ``self.chars`` and joined by ``getstring()``.
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a chunk of text; byte strings are decoded as ASCII first."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append(_unichr((char_number // 1024) + 0xD800))
                self.chars.append(_unichr((char_number % 1024) + 0xDC00))
            else:
                self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append the character of a unicode escape.

        Only ``char_number`` is used; ``escape_string`` is ignored here.
        """
        self.append_charval(char_number)

    def getstring(self):
        """Return the collected text as an ``EncodedString``."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a ``(bytes, unicode)`` pair; the bytes part is ``None``."""
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Text chunks are encoded with ``target_encoding`` when appended.
    """
    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a chunk; unicode text is encoded with the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append a single byte given by its numeric value.

        The value is mapped through ISO-8859-1, so values above 255 raise
        ``UnicodeEncodeError``.
        """
        self.chars.append(_unichr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append the text of the escape sequence itself.

        Only ``escape_string`` is used; ``char_number`` is ignored here.
        """
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a ``BytesLiteral``."""
        # this *must* return a byte string!
        return bytes_literal(join_bytes(self.chars), self.target_encoding)

    def getchar(self):
        """Return the collected bytes (same result as ``getstring()``)."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a ``(bytes, unicode)`` pair; the unicode part is ``None``."""
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    Every operation is forwarded to an internal ``BytesLiteralBuilder`` and
    an internal ``UnicodeLiteralBuilder``.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append a chunk to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Append a character value to both representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape.

        The bytes side receives the escape text, the unicode side receives
        the character itself.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the ``(bytes, unicode)`` pair of assembled strings."""
        return (self._bytes.getstring(), self._unicode.getstring())


class EncodedString(_unicode):
    """Unicode string subclass that keeps track of the original encoding."""
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object
        return self

    def byteencode(self):
        """Encode the text with its recorded source encoding."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode the text as UTF-8; only for strings without an encoding."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True if no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """See ``string_contains_surrogates()``."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded text as a ``BytesLiteral``."""
        return bytes_literal(self.utf8encode(), 'utf8')

    def as_c_string_literal(self):
        """Return the double-quoted C string literal for this string."""
        # first encodes the string then produces a c string literal
        if self.encoding is None:
            s = self.as_utf8_string()
        else:
            s = bytes_literal(self.byteencode(), self.encoding)
        return s.as_c_string_literal()

    if not hasattr(_unicode, "isascii"):
        def isascii(self):
            """Return True if the text consists of ASCII characters only."""
            # not defined for Python3.7+ since the class already has it
            try:
                self.encode("ascii")
            except UnicodeEncodeError:
                return False
            else:
                return True


def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    for c in map(ord, ustring):
        if c > 65535:  # can only happen on wide platforms
            return True
        if 0xD800 <= c <= 0xDFFF:
            return True
    return False


def string_contains_lone_surrogates(ustring):
    """
    Check if the unicode string contains lone surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform, but that do not form a pair.
    """
    last_was_start = False
    unicode_uses_surrogate_encoding = sys.maxunicode == 65535
    for c in map(ord, ustring):
        # surrogates tend to be rare
        if c < 0xD800 or c > 0xDFFF:
            if last_was_start:
                return True  # start surrogate followed by a normal character
        elif not unicode_uses_surrogate_encoding:
            # on 32bit Unicode platforms, there is never a pair
            return True
        elif c <= 0xDBFF:
            if last_was_start:
                return True  # lone start
            last_was_start = True
        else:
            if not last_was_start:
                return True  # lone end
            last_was_start = False
    # a start surrogate at the very end of the string is lone as well
    return last_was_start


class BytesLiteral(_bytes):
    """Bytes subclass that is compatible with ``EncodedString``."""
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object
        return self

    def byteencode(self):
        """Return the content as a plain byte string object."""
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        """Not supported for byte strings; fails with an assertion."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    is_unicode = False

    def as_c_string_literal(self):
        """Return the double-quoted C string literal for the bytes."""
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value

    if not hasattr(_bytes, "isascii"):
        def isascii(self):
            """Return True if all bytes are in the ASCII range."""
            # Already defined for Python3.7+.  This fallback must really
            # inspect the content to agree with the builtin method and
            # with the EncodedString fallback.
            try:
                self.decode("ascii")
            except UnicodeDecodeError:
                return False
            else:
                return True


def bytes_literal(s, encoding):
    """Wrap the byte string ``s`` in a ``BytesLiteral`` tagged with ``encoding``."""
    assert isinstance(s, bytes)
    s = BytesLiteral(s)
    s.encoding = encoding
    return s


def encoded_string(s, encoding):
    """Wrap ``s`` in an ``EncodedString`` tagged with ``encoding``.

    ``s`` may be text or an ASCII-only byte string.  Byte strings are
    decoded as ASCII (non-ASCII bytes raise ``UnicodeDecodeError``).
    The ``encoding`` attribute is only set when ``encoding`` is not None.
    """
    assert isinstance(s, (_unicode, bytes))
    if not isinstance(s, _unicode):
        # On Python 3, passing bytes straight to the str constructor gives
        # the repr text "b'...'" instead of the content.  Decode explicitly,
        # which is also what Python 2 did implicitly.
        s = s.decode('ASCII')
    s = EncodedString(s)
    if encoding is not None:
        s.encoding = encoding
    return s


def encoded_string_or_bytes_literal(s, encoding):
    """Return a ``BytesLiteral`` for bytes input, else an ``EncodedString``."""
    if isinstance(s, bytes):
        return bytes_literal(s, encoding)
    else:
        return encoded_string(s, encoding)


# Lookup function: maps the source text of a simple escape sequence to its
# character and returns None for anything else.
char_from_escape_sequence = {
    r'\a': u'\a',
    r'\b': u'\b',
    r'\f': u'\f',
    r'\n': u'\n',
    r'\r': u'\r',
    r'\t': u'\t',
    r'\v': u'\v',
}.get

# Character sequences that get replaced by escape sequences when writing
# C string literals: backslash, '??', double quote and characters 0-31.
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))


def _to_escape_sequence(s):
    """Return the escape sequence used in C string literals for ``s``."""
    if s in '\n\r\t':
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    elif s == '\\':
        return r'\\'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])


def _build_specials_replacer():
    """Build a function that escapes all ``_c_special`` sequences in bytes."""
    subexps = []
    replacements = {}
    for special in _c_special:
        # one character class per character, so nothing needs regex escaping
        # except the backslash itself
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub

    def replace_specials(m):
        return replacements[m.group(1)]

    def replace(s):
        return sub(replace_specials, s)

    return replace


_replace_specials = _build_specials_replacer()


def escape_char(c):
    """Return the escaped text for the single byte ``c`` (a byte string)."""
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c


def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        return s.decode("ASCII")  # trial decoding: plain ASCII => done
    except UnicodeDecodeError:
        pass
    # Otherwise, spell out all bytes >= 128 as octal escapes.
    if IS_PYTHON3:
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%3o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%3o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')


def split_string_literal(s, limit=2000):
    """Split the content of a long C string literal into chunks.

    Strings shorter than ``limit`` are returned unchanged.  Longer strings
    are cut into chunks of at most ``limit`` characters that are joined by
    '""', so that each chunk becomes a separate, adjacent literal once the
    caller adds the surrounding quotes.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s
    else:
        start = 0
        chunks = []
        while start < len(s):
            end = start + limit
            # If one of the last four characters of the chunk is a
            # backslash, cut there instead.
            if len(s) > end-4 and '\\' in s[end-4:end]:
                end -= 4 - s[end-4:end].find('\\')  # just before the backslash
                # ... and also before any backslashes directly preceding it
                while s[end-1] == '\\':
                    end -= 1
                    if end == start:
                        # must have been a long line of backslashes
                        end = start + limit - (limit % 2) - 4
                        break
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)


def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a pair ``(utf16, utf32)`` of comma separated decimal code unit
    lists, each terminated by a 0 entry.  The UTF-16 part is the empty
    string if it would be identical to the UTF-32 part.
    """
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                # merge the surrogate pair into a single code point
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))