#
#   Cython -- encoding related tools
#

from __future__ import absolute_import

import re
import sys

# Py2/Py3 aliases for the text, native-str, bytes and "code point -> char"
# builtins, so the rest of the module can be written version-agnostically.
if sys.version_info[0] >= 3:
    _unicode, _str, _bytes, _unichr = str, str, bytes, chr
    IS_PYTHON3 = True
else:
    _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
    IS_PYTHON3 = False

empty_bytes = _bytes()
empty_unicode = _unicode()

join_bytes = empty_bytes.join


class UnicodeLiteralBuilder(object):
    """Assemble a unicode string from text fragments and code points.

    Fragments are collected in ``self.chars`` and joined by ``getstring()``.
    """
    def __init__(self):
        self.chars = []

    def append(self, characters):
        """Append a text fragment; byte strings are decoded as ASCII first."""
        if isinstance(characters, _bytes):
            # this came from a Py2 string literal in the parser code
            characters = characters.decode("ASCII")
        assert isinstance(characters, _unicode), str(type(characters))
        self.chars.append(characters)

    # The implementation of append_charval() is chosen once, at class
    # creation time, depending on the interpreter's Unicode width.
    if sys.maxunicode == 65535:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            if char_number > 65535:
                # wide Unicode character on narrow platform => replace
                # by surrogate pair
                char_number -= 0x10000
                self.chars.append(_unichr((char_number // 1024) + 0xD800))
                self.chars.append(_unichr((char_number % 1024) + 0xDC00))
            else:
                self.chars.append(_unichr(char_number))
    else:
        def append_charval(self, char_number):
            """Append the character with code point ``char_number``."""
            self.chars.append(_unichr(char_number))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape; only the code point is used here."""
        self.append_charval(char_number)

    def getstring(self):
        """Return the collected text as an ``EncodedString``."""
        return EncodedString(u''.join(self.chars))

    def getstrings(self):
        """Return a ``(bytes, unicode)`` pair; the bytes slot is ``None``."""
        return (None, self.getstring())


class BytesLiteralBuilder(object):
    """Assemble a byte string or char value.

    Text fragments are encoded with ``target_encoding`` as they are appended.
    """
    def __init__(self, target_encoding):
        self.chars = []
        self.target_encoding = target_encoding

    def append(self, characters):
        """Append a fragment, encoding text with the target encoding."""
        if isinstance(characters, _unicode):
            characters = characters.encode(self.target_encoding)
        assert isinstance(characters, _bytes), str(type(characters))
        self.chars.append(characters)

    def append_charval(self, char_number):
        """Append a single byte given by its numeric value.

        The value is mapped through ISO-8859-1, so values above 255 raise
        ``UnicodeEncodeError``.
        """
        self.chars.append(_unichr(char_number).encode('ISO-8859-1'))

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape verbatim (the escape text, not the char)."""
        self.append(escape_string)

    def getstring(self):
        """Return the collected bytes as a ``BytesLiteral``."""
        # this *must* return a byte string!
        return bytes_literal(join_bytes(self.chars), self.target_encoding)

    def getchar(self):
        """Return the collected bytes; same result as ``getstring()``."""
        # this *must* return a byte string!
        return self.getstring()

    def getstrings(self):
        """Return a ``(bytes, unicode)`` pair; the unicode slot is ``None``."""
        return (self.getstring(), None)


class StrLiteralBuilder(object):
    """Assemble both a bytes and a unicode representation of a string.

    Every append is forwarded to an internal ``BytesLiteralBuilder`` and an
    internal ``UnicodeLiteralBuilder``.
    """
    def __init__(self, target_encoding):
        self._bytes = BytesLiteralBuilder(target_encoding)
        self._unicode = UnicodeLiteralBuilder()

    def append(self, characters):
        """Append a fragment to both representations."""
        self._bytes.append(characters)
        self._unicode.append(characters)

    def append_charval(self, char_number):
        """Append a character value to both representations."""
        self._bytes.append_charval(char_number)
        self._unicode.append_charval(char_number)

    def append_uescape(self, char_number, escape_string):
        """Append a unicode escape.

        The bytes side keeps the escape text as written, the unicode side
        receives the actual character.
        """
        self._bytes.append(escape_string)
        self._unicode.append_charval(char_number)

    def getstrings(self):
        """Return the ``(bytes, unicode)`` pair built so far."""
        return (self._bytes.getstring(), self._unicode.getstring())


class EncodedString(_unicode):
    # unicode string subclass to keep track of the original encoding.
    # 'encoding' is None for unicode strings and the source encoding
    # otherwise
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object
        return self

    def byteencode(self):
        """Encode with the recorded source encoding (which must be set)."""
        assert self.encoding is not None
        return self.encode(self.encoding)

    def utf8encode(self):
        """Encode as UTF-8; only valid when no source encoding is recorded."""
        assert self.encoding is None
        return self.encode("UTF-8")

    @property
    def is_unicode(self):
        """True when no source encoding is recorded."""
        return self.encoding is None

    def contains_surrogates(self):
        """See ``string_contains_surrogates()``."""
        return string_contains_surrogates(self)

    def as_utf8_string(self):
        """Return the UTF-8 encoded value as a ``BytesLiteral``."""
        return bytes_literal(self.utf8encode(), 'utf8')


def string_contains_surrogates(ustring):
    """
    Check if the unicode string contains surrogate code points
    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
    Unicode, i.e. characters that would be spelled as two
    separate code units on a narrow platform.
    """
    for c in map(ord, ustring):
        if c > 65535:  # can only happen on wide platforms
            return True
        if 0xD800 <= c <= 0xDFFF:
            return True
    return False


class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
    encoding = None

    def __deepcopy__(self, memo):
        # deep copies share the same object
        return self

    def byteencode(self):
        """Return the value as a plain (non-subclass) byte string."""
        if IS_PYTHON3:
            return _bytes(self)
        else:
            # fake-recode the string to make it a plain bytes object
            return self.decode('ISO-8859-1').encode('ISO-8859-1')

    def utf8encode(self):
        """Always fails: a byte string has no unicode value to encode."""
        assert False, "this is not a unicode string: %r" % self

    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
        """
        return self.decode('ISO-8859-1')

    is_unicode = False

    def as_c_string_literal(self):
        """Return the value as a double-quoted, escaped C string literal."""
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value


def bytes_literal(s, encoding):
    """Wrap the byte string ``s`` in a ``BytesLiteral`` tagged with ``encoding``."""
    assert isinstance(s, bytes)
    s = BytesLiteral(s)
    s.encoding = encoding
    return s


def encoded_string(s, encoding):
    """Wrap ``s`` in an ``EncodedString``.

    The ``encoding`` attribute is only set when ``encoding`` is not None,
    otherwise the class default (None) stays in effect.
    """
    assert isinstance(s, (_unicode, bytes))
    s = EncodedString(s)
    if encoding is not None:
        s.encoding = encoding
    return s


# Lookup function: maps a two-character escape sequence as written in source
# (e.g. backslash + 'n') to the character it denotes; returns None for
# anything that is not in the table.
char_from_escape_sequence = {
    r'\a': u'\a',
    r'\b': u'\b',
    r'\f': u'\f',
    r'\n': u'\n',
    r'\r': u'\r',
    r'\t': u'\t',
    r'\v': u'\v',
}.get

# Sequences that get escaped when writing a byte string into C code:
# backslash, '??', the double quote and all control characters below 32.
# NOTE(review): '??' is presumably listed to keep C trigraphs from forming.
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))


def _to_escape_sequence(s):
    """Return the C escape sequence text for one entry of ``_c_special``."""
    if s in '\n\r\t':
        # repr() yields the familiar short form, strip its quotes
        return repr(s)[1:-1]
    elif s == '"':
        return r'\"'
    elif s == '\\':
        return r'\\'
    else:
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])


def _build_specials_replacer():
    """Build a function that escapes all ``_c_special`` sequences in bytes.

    A single alternation regex over all special sequences is compiled once;
    the returned function substitutes each match with its precomputed
    escape sequence.
    """
    subexps = []
    replacements = {}
    for special in _c_special:
        # one single-character class per character, so that no character
        # needs regex escaping (apart from the backslash itself)
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub

    def replace_specials(m):
        return replacements[m.group(1)]

    def replace(s):
        return sub(replace_specials, s)

    return replace


_replace_specials = _build_specials_replacer()


def escape_char(c):
    """Escape a single byte (a length-1 byte string) for a C char literal.

    Returns a native string: a short backslash escape for newline, carriage
    return, tab, backslash and the single quote, a ``\\xNN`` hex escape for
    values below 32 or above 127, and the character itself otherwise.
    """
    if IS_PYTHON3:
        c = c.decode('ISO-8859-1')
    if c in '\n\r\t\\':
        return repr(c)[1:-1]
    elif c == "'":
        return "\\'"
    n = ord(c)
    if n < 32 or n > 127:
        # hex works well for characters
        return "\\x%02X" % n
    else:
        return c


def escape_byte_string(s):
    """Escape a byte string so that it can be written into C code.
    Note that this returns a Unicode string instead which, when
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
    s = _replace_specials(s)
    try:
        return s.decode("ASCII")  # trial decoding: plain ASCII => done
    except UnicodeDecodeError:
        pass
    # Non-ASCII bytes are present: spell each of them as an octal escape.
    # Values >= 128 always take three octal digits; the zero-padded format
    # matches the one used by _to_escape_sequence().
    if IS_PYTHON3:
        s_new = bytearray()
        append, extend = s_new.append, s_new.extend
        for b in s:
            if b >= 128:
                extend(('\\%03o' % b).encode('ASCII'))
            else:
                append(b)
        return s_new.decode('ISO-8859-1')
    else:
        l = []
        append = l.append
        for c in s:
            o = ord(c)
            if o >= 128:
                append('\\%03o' % o)
            else:
                append(c)
        return join_bytes(l).decode('ISO-8859-1')


def split_string_literal(s, limit=2000):
    """Split an escaped C string literal body into chunks.

    Strings shorter than ``limit`` are returned unchanged.  Longer strings
    are cut into pieces of at most ``limit`` characters that are joined with
    ``""``, i.e. the literal is closed and immediately reopened once the
    caller wraps the result in double quotes.
    """
    # MSVC can't handle long string literals.
    if len(s) < limit:
        return s
    else:
        start = 0
        chunks = []
        while start < len(s):
            end = start + limit
            # If a backslash sits in the last four characters before the
            # cut, move the cut in front of it so that the escape sequence
            # it starts is not split across two chunks.
            if len(s) > end-4 and '\\' in s[end-4:end]:
                end -= 4 - s[end-4:end].find('\\')  # just before the backslash
                while s[end-1] == '\\':
                    end -= 1
                    if end == start:
                        # must have been a long line of backslashes
                        end = start + limit - (limit % 2) - 4
                        break
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)


def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a ``(utf16, utf32)`` pair of comma-separated decimal code unit
    lists, each terminated by a 0.  The UTF-16 part is the empty string
    when it would be identical to the UTF-32 part.
    """
    s = list(map(ord, s)) + [0]

    if sys.maxunicode >= 0x10000:  # Wide build or Py3.3
        # input is UTF-32: derive UTF-16 by splitting non-BMP code points
        utf16, utf32 = [], s
        for code_point in s:
            if code_point >= 0x10000:  # outside of BMP
                high, low = divmod(code_point - 0x10000, 1024)
                utf16.append(high + 0xD800)
                utf16.append(low + 0xDC00)
            else:
                utf16.append(code_point)
    else:
        # input is UTF-16: derive UTF-32 by merging surrogate pairs
        utf16, utf32 = s, []
        for code_unit in s:
            if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
                high, low = utf32[-1], code_unit
                utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
            else:
                utf32.append(code_unit)

    if utf16 == utf32:
        utf16 = []
    return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))