"""
This module contains support functions for more advanced unicode operations.
This is not a public API and is for Numba internal use only. Most of the
functions are relatively straightforward translations of the functions with the
same name in CPython.
"""
from collections import namedtuple
from enum import IntEnum
import numpy as np
# NOTE(review): `llvmlite.llvmpy` is a legacy compatibility layer; confirm it
# is still provided by the llvmlite version this code is built against.
import llvmlite.llvmpy.core as lc

from numba.core import types, cgutils
from numba.core.imputils import (impl_ret_untracked)

from numba.core.extending import overload, intrinsic, register_jitable
from numba.core.errors import TypingError

# This is equivalent to the struct `_PyUnicode_TypeRecord` defined in CPython's
# Objects/unicodectype.c
typerecord = namedtuple('typerecord',
                        'upper lower title decimal digit flags')

# The Py_UCS4 type from CPython:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/unicodeobject.h#L112    # noqa: E501
_Py_UCS4 = types.uint32

# ------------------------------------------------------------------------------
# Start code related to/from CPython's unicodectype impl
#
# NOTE: the original source at:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c   # noqa: E501
# contains this statement:
#
# /*
#   Unicode character type helpers.
#
#   Written by Marc-Andre Lemburg (mal@lemburg.com).
#   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
#
#   Copyright (c) Corporation for National Research Initiatives.
#
# */


# This enum contains the values defined in CPython's Objects/unicodectype.c that
# provide masks for use against the various members of the typerecord
#
# See: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L13-L27    # noqa: E501
#


_Py_TAB = 0x9
_Py_LINEFEED = 0xa
_Py_CARRIAGE_RETURN = 0xd
_Py_SPACE = 0x20


class _PyUnicode_TyperecordMasks(IntEnum):
    """Bit masks tested against the `flags` member of a `typerecord`."""
    ALPHA_MASK = 0x01
    DECIMAL_MASK = 0x02
    DIGIT_MASK = 0x04
    LOWER_MASK = 0x08
    LINEBREAK_MASK = 0x10
    SPACE_MASK = 0x20
    TITLE_MASK = 0x40
    UPPER_MASK = 0x80
    XID_START_MASK = 0x100
    XID_CONTINUE_MASK = 0x200
    PRINTABLE_MASK = 0x400
    NUMERIC_MASK = 0x800
    CASE_IGNORABLE_MASK = 0x1000
    CASED_MASK = 0x2000
    EXTENDED_CASE_MASK = 0x4000


def _PyUnicode_gettyperecord(a):
    """
    Pure-Python placeholder; the usable implementation is supplied by the
    `@overload` registered below, so calling this directly always raises.
    """
    raise RuntimeError("Calling the Python definition is invalid")


@intrinsic
def _gettyperecord_impl(typingctx, codepoint):
    """
    Provides the binding to numba_gettyperecord, returns a `typerecord`
    namedtuple of properties from the codepoint.
    """
    if not isinstance(codepoint, types.Integer):
        raise TypingError("codepoint must be an integer")

    def details(context, builder, signature, args):
        # Emit a call to `numba_gettyperecord`, which fills six out-parameters.
        ll_void = context.get_value_type(types.void)
        ll_Py_UCS4 = context.get_value_type(_Py_UCS4)
        ll_intc = context.get_value_type(types.intc)
        ll_intc_ptr = ll_intc.as_pointer()
        ll_uchar = context.get_value_type(types.uchar)
        ll_uchar_ptr = ll_uchar.as_pointer()
        ll_ushort = context.get_value_type(types.ushort)
        ll_ushort_ptr = ll_ushort.as_pointer()
        fnty = lc.Type.function(ll_void, [
            ll_Py_UCS4,     # code
            ll_intc_ptr,    # upper
            ll_intc_ptr,    # lower
            ll_intc_ptr,    # title
            ll_uchar_ptr,   # decimal
            ll_uchar_ptr,   # digit
            ll_ushort_ptr,  # flags
        ])
        fn = builder.module.get_or_insert_function(
            fnty, name="numba_gettyperecord")

        # Stack slots that receive the results by reference.
        upper = cgutils.alloca_once(builder, ll_intc, name='upper')
        lower = cgutils.alloca_once(builder, ll_intc, name='lower')
        title = cgutils.alloca_once(builder, ll_intc, name='title')
        decimal = cgutils.alloca_once(builder, ll_uchar, name='decimal')
        digit = cgutils.alloca_once(builder, ll_uchar, name='digit')
        flags = cgutils.alloca_once(builder, ll_ushort, name='flags')

        byref = [upper, lower, title, decimal, digit, flags]
        builder.call(fn, [args[0]] + byref)

        # Load the filled slots, in `typerecord` field order.
        buf = [builder.load(x) for x in byref]

        res = context.make_tuple(builder, signature.return_type, tuple(buf))
        return impl_ret_untracked(context, builder, signature.return_type, res)

    tupty = types.NamedTuple([types.intc, types.intc, types.intc, types.uchar,
                              types.uchar, types.ushort], typerecord)
    sig = tupty(_Py_UCS4)
    return sig, details


@overload(_PyUnicode_gettyperecord)
def gettyperecord_impl(a):
    """
    Provides a _PyUnicode_gettyperecord binding, for convenience it will accept
    single character strings and code points.

    For string arguments the compiled implementation raises ValueError unless
    the string has exactly one character.
    """
    if isinstance(a, types.UnicodeType):
        from numba.cpython.unicode import _get_code_point

        def impl(a):
            # Reject the empty string as well as multi-character strings:
            # index 0 is read unconditionally below.
            if len(a) != 1:
                msg = "gettyperecord takes a single unicode character"
                raise ValueError(msg)
            code_point = _get_code_point(a, 0)
            data = _gettyperecord_impl(_Py_UCS4(code_point))
            return data
        return impl
    if isinstance(a, types.Integer):
        return lambda a: _gettyperecord_impl(_Py_UCS4(a))


# whilst it's possible to grab the _PyUnicode_ExtendedCase symbol as it's global
# it is safer to use a defined api:
@intrinsic
def _PyUnicode_ExtendedCase(typingctx, index):
    """
    Accessor function for the _PyUnicode_ExtendedCase array, binds to
    numba_get_PyUnicode_ExtendedCase which wraps the array and does the lookup
    """
    if not isinstance(index, types.Integer):
        raise TypingError("Expected an index")

    def details(context, builder, signature, args):
        ll_Py_UCS4 = context.get_value_type(_Py_UCS4)
        ll_intc = context.get_value_type(types.intc)
        fnty = lc.Type.function(ll_Py_UCS4, [ll_intc])
        fn = builder.module.get_or_insert_function(
            fnty, name="numba_get_PyUnicode_ExtendedCase")
        return builder.call(fn, [args[0]])

    sig = _Py_UCS4(types.intc)
    return sig, details

# The following functions are replications of the functions with the same name
# in CPython's Objects/unicodectype.c


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L64-L71    # noqa: E501
@register_jitable
def _PyUnicode_ToTitlecase(ch):
    """
    Return the titlecase code point for `ch`: an extended-case table lookup
    when EXTENDED_CASE_MASK is set, otherwise `ch` plus the title delta.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    if (ctype.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK):
        return _PyUnicode_ExtendedCase(ctype.title & 0xFFFF)
    return ch + ctype.title


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L76-L81    # noqa: E501
@register_jitable
def _PyUnicode_IsTitlecase(ch):
    """Return True if the TITLE_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.TITLE_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L86-L91    # noqa: E501
@register_jitable
def _PyUnicode_IsXidStart(ch):
    """Return True if the XID_START_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.XID_START_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L96-L101    # noqa: E501
@register_jitable
def _PyUnicode_IsXidContinue(ch):
    """Return True if the XID_CONTINUE_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.XID_CONTINUE_MASK) != 0


@register_jitable
def _PyUnicode_ToDecimalDigit(ch):
    """Return the decimal value of `ch`, or -1 if DECIMAL_MASK is not set."""
    ctype = _PyUnicode_gettyperecord(ch)
    if ctype.flags & _PyUnicode_TyperecordMasks.DECIMAL_MASK:
        return ctype.decimal
    return -1


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L128    # noqa: E501
@register_jitable
def _PyUnicode_ToDigit(ch):
    """Return the digit value of `ch`, or -1 if DIGIT_MASK is not set."""
    ctype = _PyUnicode_gettyperecord(ch)
    if ctype.flags & _PyUnicode_TyperecordMasks.DIGIT_MASK:
        return ctype.digit
    return -1


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L140-L145    # noqa: E501
@register_jitable
def _PyUnicode_IsNumeric(ch):
    """Return True if the NUMERIC_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.NUMERIC_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L160-L165    # noqa: E501
@register_jitable
def _PyUnicode_IsPrintable(ch):
    """Return True if the PRINTABLE_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.PRINTABLE_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L170-L175    # noqa: E501
@register_jitable
def _PyUnicode_IsLowercase(ch):
    """Return True if the LOWER_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.LOWER_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L180-L185    # noqa: E501
@register_jitable
def _PyUnicode_IsUppercase(ch):
    """Return True if the UPPER_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.UPPER_MASK) != 0


@register_jitable
def _PyUnicode_IsLineBreak(ch):
    """Return True if the LINEBREAK_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.LINEBREAK_MASK) != 0


@register_jitable
def _PyUnicode_ToUppercase(ch):
    """Not implemented; always raises NotImplementedError."""
    raise NotImplementedError


@register_jitable
def _PyUnicode_ToLowercase(ch):
    """Not implemented; always raises NotImplementedError."""
    raise NotImplementedError


# From: https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodectype.c#L211-L225    # noqa: E501
@register_jitable
def _PyUnicode_ToLowerFull(ch, res):
    """
    Write the lowercase mapping of `ch` into `res` (starting at index 0) and
    return the number of code points written.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    if (ctype.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK):
        # Low 16 bits: start index in the extended-case table;
        # bits from 24 upwards: number of code points in the mapping.
        index = ctype.lower & 0xFFFF
        n = ctype.lower >> 24
        for i in range(n):
            res[i] = _PyUnicode_ExtendedCase(index + i)
        return n
    res[0] = ch + ctype.lower
    return 1


# From: https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodectype.c#L227-L241    # noqa: E501
@register_jitable
def _PyUnicode_ToTitleFull(ch, res):
    """
    Write the titlecase mapping of `ch` into `res` (starting at index 0) and
    return the number of code points written.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    if (ctype.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK):
        index = ctype.title & 0xFFFF
        n = ctype.title >> 24
        for i in range(n):
            res[i] = _PyUnicode_ExtendedCase(index + i)
        return n
    res[0] = ch + ctype.title
    return 1


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L243-L257    # noqa: E501
@register_jitable
def _PyUnicode_ToUpperFull(ch, res):
    """
    Write the uppercase mapping of `ch` into `res` (starting at index 0) and
    return the number of code points written.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    if (ctype.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK):
        index = ctype.upper & 0xFFFF
        n = ctype.upper >> 24
        for i in range(n):
            # Perhaps needed to use unicode._set_code_point() here
            res[i] = _PyUnicode_ExtendedCase(index + i)
        return n
    res[0] = ch + ctype.upper
    return 1


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L259-L272    # noqa: E501
@register_jitable
def _PyUnicode_ToFoldedFull(ch, res):
    """
    Write the case-folded mapping of `ch` into `res` (starting at index 0) and
    return the number of code points written. Falls back to
    `_PyUnicode_ToLowerFull` when no dedicated folding entry is present.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    extended_case_mask = _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK
    if ctype.flags & extended_case_mask and (ctype.lower >> 20) & 7:
        # The folding entries are located after the lowercase entries in the
        # extended-case table; bits 20-22 hold their count.
        index = (ctype.lower & 0xFFFF) + (ctype.lower >> 24)
        n = (ctype.lower >> 20) & 7
        for i in range(n):
            res[i] = _PyUnicode_ExtendedCase(index + i)
        return n
    return _PyUnicode_ToLowerFull(ch, res)


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L274-L279    # noqa: E501
@register_jitable
def _PyUnicode_IsCased(ch):
    """Return True if the CASED_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.CASED_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L281-L286    # noqa: E501
@register_jitable
def _PyUnicode_IsCaseIgnorable(ch):
    """Return True if the CASE_IGNORABLE_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.CASE_IGNORABLE_MASK) != 0


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L135    # noqa: E501
@register_jitable
def _PyUnicode_IsDigit(ch):
    """Return 1 if `_PyUnicode_ToDigit(ch)` is non-negative, otherwise 0."""
    if _PyUnicode_ToDigit(ch) < 0:
        return 0
    return 1


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L106-L118    # noqa: E501
@register_jitable
def _PyUnicode_IsDecimalDigit(ch):
    """Return 1 if `_PyUnicode_ToDecimalDigit(ch)` is non-negative, else 0."""
    if _PyUnicode_ToDecimalDigit(ch) < 0:
        return 0
    return 1


# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L291-L296    # noqa: E501
@register_jitable
def _PyUnicode_IsSpace(ch):
    """Return True if the SPACE_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.SPACE_MASK) != 0


@register_jitable
def _PyUnicode_IsAlpha(ch):
    """Return True if the ALPHA_MASK flag is set for `ch`."""
    ctype = _PyUnicode_gettyperecord(ch)
    return (ctype.flags & _PyUnicode_TyperecordMasks.ALPHA_MASK) != 0


# End code related to/from CPython's unicodectype impl
# ------------------------------------------------------------------------------


# ------------------------------------------------------------------------------
# Start code related to/from CPython's pyctype

# From the definition in CPython's Include/pyctype.h
# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L5-L11    # noqa: E501
class _PY_CTF(IntEnum):
    """Character-class flag bits stored in `_Py_ctype_table`."""
    LOWER = 0x01
    UPPER = 0x02
    ALPHA = 0x01 | 0x02
    DIGIT = 0x04
    ALNUM = 0x01 | 0x02 | 0x04
    SPACE = 0x08
    XDIGIT = 0x10


# From the definition in CPython's Python/pyctype.c
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/pyctype.c#L5    # noqa: E501
_Py_ctype_table = np.array([
    0,  # 0x0 '\x00'
    0,  # 0x1 '\x01'
    0,  # 0x2 '\x02'
    0,  # 0x3 '\x03'
    0,  # 0x4 '\x04'
    0,  # 0x5 '\x05'
    0,  # 0x6 '\x06'
    0,  # 0x7 '\x07'
    0,  # 0x8 '\x08'
    _PY_CTF.SPACE,  # 0x9 '\t'
    _PY_CTF.SPACE,  # 0xa '\n'
    _PY_CTF.SPACE,  # 0xb '\v'
    _PY_CTF.SPACE,  # 0xc '\f'
    _PY_CTF.SPACE,  # 0xd '\r'
    0,  # 0xe '\x0e'
    0,  # 0xf '\x0f'
    0,  # 0x10 '\x10'
    0,  # 0x11 '\x11'
    0,  # 0x12 '\x12'
    0,  # 0x13 '\x13'
    0,  # 0x14 '\x14'
    0,  # 0x15 '\x15'
    0,  # 0x16 '\x16'
    0,  # 0x17 '\x17'
    0,  # 0x18 '\x18'
    0,  # 0x19 '\x19'
    0,  # 0x1a '\x1a'
    0,  # 0x1b '\x1b'
    0,  # 0x1c '\x1c'
    0,  # 0x1d '\x1d'
    0,  # 0x1e '\x1e'
    0,  # 0x1f '\x1f'
    _PY_CTF.SPACE,  # 0x20 ' '
    0,  # 0x21 '!'
    0,  # 0x22 '"'
    0,  # 0x23 '#'
    0,  # 0x24 '$'
    0,  # 0x25 '%'
    0,  # 0x26 '&'
    0,  # 0x27 "'"
    0,  # 0x28 '('
    0,  # 0x29 ')'
    0,  # 0x2a '*'
    0,  # 0x2b '+'
    0,  # 0x2c ','
    0,  # 0x2d '-'
    0,  # 0x2e '.'
    0,  # 0x2f '/'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x30 '0'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x31 '1'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x32 '2'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x33 '3'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x34 '4'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x35 '5'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x36 '6'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x37 '7'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x38 '8'
    _PY_CTF.DIGIT | _PY_CTF.XDIGIT,  # 0x39 '9'
    0,  # 0x3a ':'
    0,  # 0x3b ';'
    0,  # 0x3c '<'
    0,  # 0x3d '='
    0,  # 0x3e '>'
    0,  # 0x3f '?'
    0,  # 0x40 '@'
    _PY_CTF.UPPER | _PY_CTF.XDIGIT,  # 0x41 'A'
    _PY_CTF.UPPER | _PY_CTF.XDIGIT,  # 0x42 'B'
    _PY_CTF.UPPER | _PY_CTF.XDIGIT,  # 0x43 'C'
    _PY_CTF.UPPER | _PY_CTF.XDIGIT,  # 0x44 'D'
    _PY_CTF.UPPER | _PY_CTF.XDIGIT,  # 0x45 'E'
    _PY_CTF.UPPER | _PY_CTF.XDIGIT,  # 0x46 'F'
    _PY_CTF.UPPER,  # 0x47 'G'
    _PY_CTF.UPPER,  # 0x48 'H'
    _PY_CTF.UPPER,  # 0x49 'I'
    _PY_CTF.UPPER,  # 0x4a 'J'
    _PY_CTF.UPPER,  # 0x4b 'K'
    _PY_CTF.UPPER,  # 0x4c 'L'
    _PY_CTF.UPPER,  # 0x4d 'M'
    _PY_CTF.UPPER,  # 0x4e 'N'
    _PY_CTF.UPPER,  # 0x4f 'O'
    _PY_CTF.UPPER,  # 0x50 'P'
    _PY_CTF.UPPER,  # 0x51 'Q'
    _PY_CTF.UPPER,  # 0x52 'R'
    _PY_CTF.UPPER,  # 0x53 'S'
    _PY_CTF.UPPER,  # 0x54 'T'
    _PY_CTF.UPPER,  # 0x55 'U'
    _PY_CTF.UPPER,  # 0x56 'V'
    _PY_CTF.UPPER,  # 0x57 'W'
    _PY_CTF.UPPER,  # 0x58 'X'
    _PY_CTF.UPPER,  # 0x59 'Y'
    _PY_CTF.UPPER,  # 0x5a 'Z'
    0,  # 0x5b '['
    0,  # 0x5c '\\'
    0,  # 0x5d ']'
    0,  # 0x5e '^'
    0,  # 0x5f '_'
    0,  # 0x60 '`'
    _PY_CTF.LOWER | _PY_CTF.XDIGIT,  # 0x61 'a'
    _PY_CTF.LOWER | _PY_CTF.XDIGIT,  # 0x62 'b'
    _PY_CTF.LOWER | _PY_CTF.XDIGIT,  # 0x63 'c'
    _PY_CTF.LOWER | _PY_CTF.XDIGIT,  # 0x64 'd'
    _PY_CTF.LOWER | _PY_CTF.XDIGIT,  # 0x65 'e'
    _PY_CTF.LOWER | _PY_CTF.XDIGIT,  # 0x66 'f'
    _PY_CTF.LOWER,  # 0x67 'g'
    _PY_CTF.LOWER,  # 0x68 'h'
    _PY_CTF.LOWER,  # 0x69 'i'
    _PY_CTF.LOWER,  # 0x6a 'j'
    _PY_CTF.LOWER,  # 0x6b 'k'
    _PY_CTF.LOWER,  # 0x6c 'l'
    _PY_CTF.LOWER,  # 0x6d 'm'
    _PY_CTF.LOWER,  # 0x6e 'n'
    _PY_CTF.LOWER,  # 0x6f 'o'
    _PY_CTF.LOWER,  # 0x70 'p'
    _PY_CTF.LOWER,  # 0x71 'q'
    _PY_CTF.LOWER,  # 0x72 'r'
    _PY_CTF.LOWER,  # 0x73 's'
    _PY_CTF.LOWER,  # 0x74 't'
    _PY_CTF.LOWER,  # 0x75 'u'
    _PY_CTF.LOWER,  # 0x76 'v'
    _PY_CTF.LOWER,  # 0x77 'w'
    _PY_CTF.LOWER,  # 0x78 'x'
    _PY_CTF.LOWER,  # 0x79 'y'
    _PY_CTF.LOWER,  # 0x7a 'z'
    0,  # 0x7b '{'
    0,  # 0x7c '|'
    0,  # 0x7d '}'
    0,  # 0x7e '~'
    0,  # 0x7f '\x7f'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
], dtype=np.intc)


# From the definition in CPython's Python/pyctype.c
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/pyctype.c#L145    # noqa: E501
_Py_ctype_tolower = np.array([
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
], dtype=np.uint8)


# From the definition in CPython's Python/pyctype.c
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/pyctype.c#L180    # noqa: E501
_Py_ctype_toupper = np.array([
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
    0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
    0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
], dtype=np.uint8)


class _PY_CTF_LB(IntEnum):
    """Line-break flag bits stored in `_Py_ctype_islinebreak`."""
    LINE_BREAK = 0x01
    LINE_FEED = 0x02
    CARRIAGE_RETURN = 0x04


# 256-entry table indexed by the (masked) code point.
_Py_ctype_islinebreak = np.array([
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.LINE_FEED,  # 0xa '\n'
    _PY_CTF_LB.LINE_BREAK,  # 0xb '\v'
    _PY_CTF_LB.LINE_BREAK,  # 0xc '\f'
    _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.CARRIAGE_RETURN,  # 0xd '\r'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    _PY_CTF_LB.LINE_BREAK,  # 0x1c '\x1c'
    _PY_CTF_LB.LINE_BREAK,  # 0x1d '\x1d'
    _PY_CTF_LB.LINE_BREAK,  # 0x1e '\x1e'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    _PY_CTF_LB.LINE_BREAK,  # 0x85 '\x85'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0,
], dtype=np.intc)


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pymacro.h#L25    # noqa: E501
@register_jitable
def _Py_CHARMASK(ch):
    """
    Equivalent to the CPython macro `Py_CHARMASK()`, masks off all but the
    lowest 8 bits of ch (i.e. yields a value in the range 0-255).
    """
    return types.uint8(ch) & types.uint8(0xff)


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L30    # noqa: E501
@register_jitable
def _Py_TOUPPER(ch):
    """
    Equivalent to the CPython macro `Py_TOUPPER()` converts an ASCII range
    code point to the upper equivalent
    """
    return _Py_ctype_toupper[_Py_CHARMASK(ch)]


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L29    # noqa: E501
@register_jitable
def _Py_TOLOWER(ch):
    """
    Equivalent to the CPython macro `Py_TOLOWER()` converts an ASCII range
    code point to the lower equivalent
    """
    return _Py_ctype_tolower[_Py_CHARMASK(ch)]


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L18    # noqa: E501
@register_jitable
def _Py_ISLOWER(ch):
    """
    Equivalent to the CPython macro `Py_ISLOWER()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.LOWER


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L19    # noqa: E501
@register_jitable
def _Py_ISUPPER(ch):
    """
    Equivalent to the CPython macro `Py_ISUPPER()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.UPPER


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L20    # noqa: E501
@register_jitable
def _Py_ISALPHA(ch):
    """
    Equivalent to the CPython macro `Py_ISALPHA()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.ALPHA


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L21    # noqa: E501
@register_jitable
def _Py_ISDIGIT(ch):
    """
    Equivalent to the CPython macro `Py_ISDIGIT()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.DIGIT


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L22    # noqa: E501
@register_jitable
def _Py_ISXDIGIT(ch):
    """
    Equivalent to the CPython macro `Py_ISXDIGIT()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.XDIGIT


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L23    # noqa: E501
@register_jitable
def _Py_ISALNUM(ch):
    """
    Equivalent to the CPython macro `Py_ISALNUM()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.ALNUM


# Translation of:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L24    # noqa: E501
@register_jitable
def _Py_ISSPACE(ch):
    """
    Equivalent to the CPython macro `Py_ISSPACE()`
    """
    return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.SPACE


@register_jitable
def _Py_ISLINEBREAK(ch):
    """Check if character is ASCII line break"""
    return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_BREAK


@register_jitable
def _Py_ISLINEFEED(ch):
    r"""Check if character is line feed `\n`"""
    return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_FEED


@register_jitable
def _Py_ISCARRIAGERETURN(ch):
    r"""Check if character is carriage return `\r`"""
    return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.CARRIAGE_RETURN


# End code related to/from CPython's pyctype
# ------------------------------------------------------------------------------