1"""
2This module contains support functions for more advanced unicode operations.
3This is not a public API and is for Numba internal use only. Most of the
4functions are relatively straightforward translations of the functions with the
5same name in CPython.
6"""
7from collections import namedtuple
8from enum import IntEnum
9import numpy as np
10import llvmlite.llvmpy.core as lc
11
12from numba.core import types, cgutils
13from numba.core.imputils import (impl_ret_untracked)
14
15from numba.core.extending import overload, intrinsic, register_jitable
16from numba.core.errors import TypingError
17
# Equivalent to the `_PyUnicode_TypeRecord` struct defined in CPython's
# Objects/unicodectype.c
typerecord = namedtuple(
    'typerecord',
    ['upper', 'lower', 'title', 'decimal', 'digit', 'flags'],
)
22
# The Py_UCS4 type from CPython:
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/unicodeobject.h#L112    # noqa: E501
# Represented here by Numba's unsigned 32-bit integer type; it is used below
# both as a signature type and as a cast for code point values.
_Py_UCS4 = types.uint32
26
27# ------------------------------------------------------------------------------
28# Start code related to/from CPython's unicodectype impl
29#
30# NOTE: the original source at:
31# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c   # noqa: E501
32# contains this statement:
33#
34# /*
35#   Unicode character type helpers.
36#
37#   Written by Marc-Andre Lemburg (mal@lemburg.com).
38#   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
39#
40#   Copyright (c) Corporation for National Research Initiatives.
41#
42# */
43
44
# The `_PyUnicode_TyperecordMasks` enum defined further below contains the
# values from CPython's Objects/unicodectype.c that provide masks for use
# against the various members of the typerecord
47#
48# See: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L13-L27    # noqa: E501
49#
50
51
# Code points of selected ASCII whitespace characters
_Py_TAB = ord('\t')
_Py_LINEFEED = ord('\n')
_Py_CARRIAGE_RETURN = ord('\r')
_Py_SPACE = ord(' ')
56
57
class _PyUnicode_TyperecordMasks(IntEnum):
    """
    Single-bit masks that are AND-ed with the `flags` member of a typerecord
    to test a property of a code point. Each member occupies its own bit, from
    bit 0 (ALPHA_MASK) up to bit 14 (EXTENDED_CASE_MASK).
    """
    ALPHA_MASK = 1 << 0
    DECIMAL_MASK = 1 << 1
    DIGIT_MASK = 1 << 2
    LOWER_MASK = 1 << 3
    LINEBREAK_MASK = 1 << 4
    SPACE_MASK = 1 << 5
    TITLE_MASK = 1 << 6
    UPPER_MASK = 1 << 7
    XID_START_MASK = 1 << 8
    XID_CONTINUE_MASK = 1 << 9
    PRINTABLE_MASK = 1 << 10
    NUMERIC_MASK = 1 << 11
    CASE_IGNORABLE_MASK = 1 << 12
    CASED_MASK = 1 << 13
    EXTENDED_CASE_MASK = 1 << 14
74
75
def _PyUnicode_gettyperecord(a):
    """
    Pure-Python placeholder that exists only as a target for the `@overload`
    registered in this module. Calling it from the interpreter always raises
    RuntimeError.
    """
    msg = "Calling the Python definition is invalid"
    raise RuntimeError(msg)
78
79
@intrinsic
def _gettyperecord_impl(typingctx, codepoint):
    """
    Intrinsic binding to the C function `numba_gettyperecord`.

    Typing fails with a TypingError unless `codepoint` is an integer type. The
    result is a `typerecord` named tuple typed as
    (intc, intc, intc, uchar, uchar, ushort).
    """
    if not isinstance(codepoint, types.Integer):
        raise TypingError("codepoint must be an integer")

    def details(context, builder, signature, args):
        ll_int = context.get_value_type(types.intc)
        ll_byte = context.get_value_type(types.uchar)
        ll_short = context.get_value_type(types.ushort)
        # LLVM types of the out-parameters, in typerecord field order:
        # upper, lower, title, decimal, digit, flags
        out_types = [ll_int, ll_int, ll_int, ll_byte, ll_byte, ll_short]

        # The callee takes the code point followed by one pointer per field
        # and returns void.
        arg_types = [context.get_value_type(_Py_UCS4)]
        arg_types.extend(ty.as_pointer() for ty in out_types)
        fnty = lc.Type.function(context.get_value_type(types.void), arg_types)
        fn = builder.module.get_or_insert_function(
            fnty, name="numba_gettyperecord")

        # One stack slot per field for the callee to write into
        slots = [cgutils.alloca_once(builder, ty, name=field)
                 for field, ty in zip(typerecord._fields, out_types)]
        builder.call(fn, [args[0]] + slots)

        loaded = tuple(builder.load(slot) for slot in slots)
        res = context.make_tuple(builder, signature.return_type, loaded)
        return impl_ret_untracked(context, builder, signature.return_type, res)

    field_types = [types.intc] * 3 + [types.uchar] * 2 + [types.ushort]
    tupty = types.NamedTuple(field_types, typerecord)
    return tupty(_Py_UCS4), details
129
130
@overload(_PyUnicode_gettyperecord)
def gettyperecord_impl(a):
    """
    Provides a _PyUnicode_gettyperecord binding, for convenience it will accept
    single character strings and code points.

    For a unicode string argument the implementation raises ValueError unless
    the string holds exactly one character. For an integer argument the value
    is cast to `_Py_UCS4` and looked up directly. No implementation is
    returned for any other argument type.
    """
    if isinstance(a, types.UnicodeType):
        from numba.cpython.unicode import _get_code_point

        def impl(a):
            # Reject the empty string as well as multi-character strings: an
            # empty string has no code point at index 0 to look up.
            if len(a) != 1:
                msg = "gettyperecord takes a single unicode character"
                raise ValueError(msg)
            code_point = _get_code_point(a, 0)
            data = _gettyperecord_impl(_Py_UCS4(code_point))
            return data
        return impl
    if isinstance(a, types.Integer):
        return lambda a: _gettyperecord_impl(_Py_UCS4(a))
150
151
152# whilst it's possible to grab the _PyUnicode_ExtendedCase symbol as it's global
153# it is safer to use a defined api:
@intrinsic
def _PyUnicode_ExtendedCase(typingctx, index):
    """
    Intrinsic accessor for the _PyUnicode_ExtendedCase array. It calls the C
    function `numba_get_PyUnicode_ExtendedCase` with `index` (typed as intc)
    and returns the resulting `_Py_UCS4` value. Typing fails with a
    TypingError unless `index` is an integer type.
    """
    if not isinstance(index, types.Integer):
        raise TypingError("Expected an index")

    def details(context, builder, signature, args):
        idx = args[0]
        ret_ty = context.get_value_type(_Py_UCS4)
        arg_ty = context.get_value_type(types.intc)
        accessor = builder.module.get_or_insert_function(
            lc.Type.function(ret_ty, [arg_ty]),
            name="numba_get_PyUnicode_ExtendedCase")
        return builder.call(accessor, [idx])

    return _Py_UCS4(types.intc), details
173
174# The following functions are replications of the functions with the same name
175# in CPython's Objects/unicodectype.c
176
177
178# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L64-L71    # noqa: E501
@register_jitable
def _PyUnicode_ToTitlecase(ch):
    """
    Return the titlecase code point for `ch`. When the typerecord has the
    extended-case flag set, the result is the extended-case table entry at the
    low 16 bits of `title`; otherwise it is `ch` offset by `title`.
    """
    rec = _PyUnicode_gettyperecord(ch)
    if (rec.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK) == 0:
        return ch + rec.title
    return _PyUnicode_ExtendedCase(rec.title & 0xFFFF)
185
186
187# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L76-L81    # noqa: E501
@register_jitable
def _PyUnicode_IsTitlecase(ch):
    """
    Return True if the typerecord flags of `ch` have TITLE_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.TITLE_MASK) != 0
192
193
194# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L86-L91    # noqa: E501
@register_jitable
def _PyUnicode_IsXidStart(ch):
    """
    Return True if the typerecord flags of `ch` have XID_START_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.XID_START_MASK) != 0
199
200
201# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L96-L101    # noqa: E501
@register_jitable
def _PyUnicode_IsXidContinue(ch):
    """
    Return True if the typerecord flags of `ch` have XID_CONTINUE_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.XID_CONTINUE_MASK) != 0
206
207
@register_jitable
def _PyUnicode_ToDecimalDigit(ch):
    """
    Return the `decimal` member of the typerecord of `ch` when DECIMAL_MASK is
    set in its flags, otherwise -1.
    """
    rec = _PyUnicode_gettyperecord(ch)
    if (rec.flags & _PyUnicode_TyperecordMasks.DECIMAL_MASK) == 0:
        return -1
    return rec.decimal
214
215
# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L128  # noqa: E501
@register_jitable
def _PyUnicode_ToDigit(ch):
    """
    Return the `digit` member of the typerecord of `ch` when DIGIT_MASK is set
    in its flags, otherwise -1.
    """
    rec = _PyUnicode_gettyperecord(ch)
    if (rec.flags & _PyUnicode_TyperecordMasks.DIGIT_MASK) == 0:
        return -1
    return rec.digit
223
224
225# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L140-L145    # noqa: E501
@register_jitable
def _PyUnicode_IsNumeric(ch):
    """
    Return True if the typerecord flags of `ch` have NUMERIC_MASK set.
    """
    mask = _PyUnicode_TyperecordMasks.NUMERIC_MASK
    return (_PyUnicode_gettyperecord(ch).flags & mask) != 0
230
231
232# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L160-L165    # noqa: E501
@register_jitable
def _PyUnicode_IsPrintable(ch):
    """
    Return True if the typerecord flags of `ch` have PRINTABLE_MASK set.
    """
    mask = _PyUnicode_TyperecordMasks.PRINTABLE_MASK
    return (_PyUnicode_gettyperecord(ch).flags & mask) != 0
237
238
239# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L170-L175    # noqa: E501
@register_jitable
def _PyUnicode_IsLowercase(ch):
    """
    Return True if the typerecord flags of `ch` have LOWER_MASK set.
    """
    mask = _PyUnicode_TyperecordMasks.LOWER_MASK
    return (_PyUnicode_gettyperecord(ch).flags & mask) != 0
244
245
246# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L180-L185    # noqa: E501
@register_jitable
def _PyUnicode_IsUppercase(ch):
    """
    Return True if the typerecord flags of `ch` have UPPER_MASK set.
    """
    mask = _PyUnicode_TyperecordMasks.UPPER_MASK
    return (_PyUnicode_gettyperecord(ch).flags & mask) != 0
251
252
@register_jitable
def _PyUnicode_IsLineBreak(ch):
    """
    Return True if the typerecord flags of `ch` have LINEBREAK_MASK set.
    """
    mask = _PyUnicode_TyperecordMasks.LINEBREAK_MASK
    return (_PyUnicode_gettyperecord(ch).flags & mask) != 0
257
258
@register_jitable
def _PyUnicode_ToUppercase(ch):
    """
    Return the single code point uppercase mapping of `ch`.

    Follows the same scheme as `_PyUnicode_ToTitlecase`, reading the `upper`
    member of the typerecord: when the extended-case flag is set the result is
    the extended-case table entry at the low 16 bits of `upper`, otherwise it
    is `ch` offset by `upper`.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    if (ctype.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK):
        return _PyUnicode_ExtendedCase(ctype.upper & 0xFFFF)
    return ch + ctype.upper
262
263
@register_jitable
def _PyUnicode_ToLowercase(ch):
    """
    Return the single code point lowercase mapping of `ch`.

    Follows the same scheme as `_PyUnicode_ToTitlecase`, reading the `lower`
    member of the typerecord: when the extended-case flag is set the result is
    the extended-case table entry at the low 16 bits of `lower`, otherwise it
    is `ch` offset by `lower`.
    """
    ctype = _PyUnicode_gettyperecord(ch)
    if (ctype.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK):
        return _PyUnicode_ExtendedCase(ctype.lower & 0xFFFF)
    return ch + ctype.lower
267
268
269# From: https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodectype.c#L211-L225    # noqa: E501
@register_jitable
def _PyUnicode_ToLowerFull(ch, res):
    """
    Write the full lowercase mapping of `ch` into `res`, starting at index 0,
    and return the number of code points written.
    """
    rec = _PyUnicode_gettyperecord(ch)
    if (rec.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK) == 0:
        # Simple case: a single code point at a fixed offset from `ch`
        res[0] = ch + rec.lower
        return 1
    # Extended case: the low 16 bits give the start position in the
    # extended-case table, the bits from 24 upwards give the entry count
    base = rec.lower & 0xFFFF
    count = rec.lower >> 24
    for offset in range(count):
        res[offset] = _PyUnicode_ExtendedCase(base + offset)
    return count
281
282
283# From: https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodectype.c#L227-L241    # noqa: E501
@register_jitable
def _PyUnicode_ToTitleFull(ch, res):
    """
    Write the full titlecase mapping of `ch` into `res`, starting at index 0,
    and return the number of code points written.
    """
    rec = _PyUnicode_gettyperecord(ch)
    if (rec.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK) == 0:
        # Simple case: a single code point at a fixed offset from `ch`
        res[0] = ch + rec.title
        return 1
    # Extended case: the low 16 bits give the start position in the
    # extended-case table, the bits from 24 upwards give the entry count
    base = rec.title & 0xFFFF
    count = rec.title >> 24
    for offset in range(count):
        res[offset] = _PyUnicode_ExtendedCase(base + offset)
    return count
295
296
297# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L243-L257    # noqa: E501
@register_jitable
def _PyUnicode_ToUpperFull(ch, res):
    """
    Write the full uppercase mapping of `ch` into `res`, starting at index 0,
    and return the number of code points written.
    """
    rec = _PyUnicode_gettyperecord(ch)
    if (rec.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK) == 0:
        # Simple case: a single code point at a fixed offset from `ch`
        res[0] = ch + rec.upper
        return 1
    # Extended case: the low 16 bits give the start position in the
    # extended-case table, the bits from 24 upwards give the entry count
    base = rec.upper & 0xFFFF
    count = rec.upper >> 24
    for offset in range(count):
        # Perhaps needed to use unicode._set_code_point() here
        res[offset] = _PyUnicode_ExtendedCase(base + offset)
    return count
310
311
312# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L259-L272    # noqa: E501
@register_jitable
def _PyUnicode_ToFoldedFull(ch, res):
    """
    Write the case-folded mapping of `ch` into `res`, starting at index 0, and
    return the number of code points written. When the typerecord carries no
    dedicated folding entry this defers to `_PyUnicode_ToLowerFull`.
    """
    rec = _PyUnicode_gettyperecord(ch)
    # Bits 20-22 of `lower` hold the length of the folding entry
    fold_len = (rec.lower >> 20) & 7
    is_extended = (
        rec.flags & _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK) != 0
    if is_extended and fold_len != 0:
        # The start position is the low 16 bits of `lower` plus the value
        # held in the bits from 24 upwards
        base = (rec.lower & 0xFFFF) + (rec.lower >> 24)
        for offset in range(fold_len):
            res[offset] = _PyUnicode_ExtendedCase(base + offset)
        return fold_len
    return _PyUnicode_ToLowerFull(ch, res)
324
325
326# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L274-L279    # noqa: E501
@register_jitable
def _PyUnicode_IsCased(ch):
    """
    Return True if the typerecord flags of `ch` have CASED_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.CASED_MASK) != 0
331
332
333# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L281-L286    # noqa: E501
@register_jitable
def _PyUnicode_IsCaseIgnorable(ch):
    """
    Return True if the typerecord flags of `ch` have CASE_IGNORABLE_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.CASE_IGNORABLE_MASK) != 0
338
339
340# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L135    # noqa: E501
@register_jitable
def _PyUnicode_IsDigit(ch):
    """
    Return 1 if `_PyUnicode_ToDigit(ch)` yields a non-negative value, else 0.
    The result is an integer, not a bool.
    """
    return 0 if _PyUnicode_ToDigit(ch) < 0 else 1
346
347
348# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L106-L118    # noqa: E501
@register_jitable
def _PyUnicode_IsDecimalDigit(ch):
    """
    Return 1 if `_PyUnicode_ToDecimalDigit(ch)` yields a non-negative value,
    else 0. The result is an integer, not a bool.
    """
    return 0 if _PyUnicode_ToDecimalDigit(ch) < 0 else 1
354
355
356# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L291-L296    # noqa: E501
@register_jitable
def _PyUnicode_IsSpace(ch):
    """
    Return True if the typerecord flags of `ch` have SPACE_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.SPACE_MASK) != 0
361
362
@register_jitable
def _PyUnicode_IsAlpha(ch):
    """
    Return True if the typerecord flags of `ch` have ALPHA_MASK set.
    """
    flags = _PyUnicode_gettyperecord(ch).flags
    return (flags & _PyUnicode_TyperecordMasks.ALPHA_MASK) != 0
367
368
369# End code related to/from CPython's unicodectype impl
370# ------------------------------------------------------------------------------
371
372
373# ------------------------------------------------------------------------------
374# Start code related to/from CPython's pyctype
375
376# From the definition in CPython's Include/pyctype.h
377# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L5-L11    # noqa: E501
class _PY_CTF(IntEnum):
    """
    Character class flags stored in `_Py_ctype_table`. ALPHA and ALNUM are
    combinations of the single-bit flags.
    """
    LOWER = 1 << 0
    UPPER = 1 << 1
    ALPHA = LOWER | UPPER
    DIGIT = 1 << 2
    ALNUM = ALPHA | DIGIT
    SPACE = 1 << 3
    XDIGIT = 1 << 4
387
388# From the definition in CPython's Python/pyctype.c
389# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/pyctype.c#L5    # noqa: E501
# 256-entry table of `_PY_CTF` flags indexed by byte value; every entry not
# assigned below (including all of 0x80-0xff) is 0.
_Py_ctype_table = np.zeros(256, dtype=np.intc)
# 0x09-0x0d: '\t', '\n', '\v', '\f', '\r'
_Py_ctype_table[0x09:0x0e] = _PY_CTF.SPACE
# 0x20: ' '
_Py_ctype_table[0x20] = _PY_CTF.SPACE
# 0x30-0x39: '0'-'9'
_Py_ctype_table[0x30:0x3a] = _PY_CTF.DIGIT | _PY_CTF.XDIGIT
# 0x41-0x46: 'A'-'F' are uppercase and hex digits
_Py_ctype_table[0x41:0x47] = _PY_CTF.UPPER | _PY_CTF.XDIGIT
# 0x47-0x5a: 'G'-'Z'
_Py_ctype_table[0x47:0x5b] = _PY_CTF.UPPER
# 0x61-0x66: 'a'-'f' are lowercase and hex digits
_Py_ctype_table[0x61:0x67] = _PY_CTF.LOWER | _PY_CTF.XDIGIT
# 0x67-0x7a: 'g'-'z'
_Py_ctype_table[0x67:0x7b] = _PY_CTF.LOWER
528
529
530# From the definition in CPython's Python/pyctype.c
531# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/pyctype.c#L145    # noqa: E501
# 256-entry byte-to-byte map: the identity everywhere except 'A'-'Z'
# (0x41-0x5a), which map to 'a'-'z' (0x61-0x7a).
_Py_ctype_tolower = np.arange(256, dtype=np.uint8)
_Py_ctype_tolower[0x41:0x5b] += 0x20
566
567
568# From the definition in CPython's Python/pyctype.c
569# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/pyctype.c#L180
# 256-entry byte-to-byte map: the identity everywhere except 'a'-'z'
# (0x61-0x7a), which map to 'A'-'Z' (0x41-0x5a).
_Py_ctype_toupper = np.arange(256, dtype=np.uint8)
_Py_ctype_toupper[0x61:0x7b] -= 0x20
604
605
class _PY_CTF_LB(IntEnum):
    """
    Flags stored in `_Py_ctype_islinebreak`: one bit for any line break, one
    for line feed and one for carriage return.
    """
    LINE_BREAK = 1 << 0
    LINE_FEED = 1 << 1
    CARRIAGE_RETURN = 1 << 2
611
# 256-entry table of `_PY_CTF_LB` flags indexed by byte value; every entry
# not assigned below is 0.
_Py_ctype_islinebreak = np.zeros(256, dtype=np.intc)
# '\v', '\f', '\x1c', '\x1d', '\x1e' and '\x85' are plain line breaks
_Py_ctype_islinebreak[[0x0b, 0x0c, 0x1c, 0x1d, 0x1e, 0x85]] = (
    _PY_CTF_LB.LINE_BREAK)
# 0x0a '\n' is a line break that is also flagged as line feed
_Py_ctype_islinebreak[0x0a] = _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.LINE_FEED
# 0x0d '\r' is a line break that is also flagged as carriage return
_Py_ctype_islinebreak[0x0d] = (
    _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.CARRIAGE_RETURN)
638
639
640# Translation of:
641# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pymacro.h#L25    # noqa: E501
@register_jitable
def _Py_CHARMASK(ch):
    """
    Equivalent to the CPython macro `Py_CHARMASK()`: casts `ch` to uint8 and
    masks it with 0xff, i.e. keeps only the lowest 8 bits of ch.
    """
    low_byte_mask = types.uint8(0xff)
    return types.uint8(ch) & low_byte_mask
649
650
651# Translation of:
652# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L30    # noqa: E501
@register_jitable
def _Py_TOUPPER(ch):
    """
    Equivalent to the CPython macro `Py_TOUPPER()`: looks up the low byte of
    `ch` in `_Py_ctype_toupper`.
    """
    idx = _Py_CHARMASK(ch)
    return _Py_ctype_toupper[idx]
660
661
662# Translation of:
663# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L29    # noqa: E501
@register_jitable
def _Py_TOLOWER(ch):
    """
    Equivalent to the CPython macro `Py_TOLOWER()`: looks up the low byte of
    `ch` in `_Py_ctype_tolower`.
    """
    idx = _Py_CHARMASK(ch)
    return _Py_ctype_tolower[idx]
671
672
673# Translation of:
674# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L18    # noqa: E501
@register_jitable
def _Py_ISLOWER(ch):
    """
    Equivalent to the CPython macro `Py_ISLOWER()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.LOWER`: nonzero when the
    flag is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.LOWER
681
682
683# Translation of:
684# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L19    # noqa: E501
@register_jitable
def _Py_ISUPPER(ch):
    """
    Equivalent to the CPython macro `Py_ISUPPER()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.UPPER`: nonzero when the
    flag is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.UPPER
691
692
693# Translation of:
694# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L20    # noqa: E501
@register_jitable
def _Py_ISALPHA(ch):
    """
    Equivalent to the CPython macro `Py_ISALPHA()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.ALPHA`: nonzero when any of
    the masked flags is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.ALPHA
701
702
703# Translation of:
704# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L21    # noqa: E501
@register_jitable
def _Py_ISDIGIT(ch):
    """
    Equivalent to the CPython macro `Py_ISDIGIT()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.DIGIT`: nonzero when the
    flag is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.DIGIT
711
712
713# Translation of:
714# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L22    # noqa: E501
@register_jitable
def _Py_ISXDIGIT(ch):
    """
    Equivalent to the CPython macro `Py_ISXDIGIT()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.XDIGIT`: nonzero when the
    flag is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.XDIGIT
721
722
723# Translation of:
724# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L23    # noqa: E501
@register_jitable
def _Py_ISALNUM(ch):
    """
    Equivalent to the CPython macro `Py_ISALNUM()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.ALNUM`: nonzero when any of
    the masked flags is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.ALNUM
731
732
733# Translation of:
734# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L24    # noqa: E501
@register_jitable
def _Py_ISSPACE(ch):
    """
    Equivalent to the CPython macro `Py_ISSPACE()`. Returns the table entry
    for the low byte of `ch` masked with `_PY_CTF.SPACE`: nonzero when the
    flag is set, 0 otherwise (an integer, not a bool).
    """
    entry = _Py_ctype_table[_Py_CHARMASK(ch)]
    return entry & _PY_CTF.SPACE
741
742
@register_jitable
def _Py_ISLINEBREAK(ch):
    """
    Check if the low byte of `ch` is flagged as a line break in
    `_Py_ctype_islinebreak`. Returns the masked table entry (nonzero or 0),
    not a bool.
    """
    entry = _Py_ctype_islinebreak[_Py_CHARMASK(ch)]
    return entry & _PY_CTF_LB.LINE_BREAK
747
748
@register_jitable
def _Py_ISLINEFEED(ch):
    """
    Check if the low byte of `ch` is flagged as line feed (0x0a) in
    `_Py_ctype_islinebreak`. Returns the masked table entry (nonzero or 0),
    not a bool.
    """
    entry = _Py_ctype_islinebreak[_Py_CHARMASK(ch)]
    return entry & _PY_CTF_LB.LINE_FEED
753
754
@register_jitable
def _Py_ISCARRIAGERETURN(ch):
    """
    Check if the low byte of `ch` is flagged as carriage return (0x0d) in
    `_Py_ctype_islinebreak`. Returns the masked table entry (nonzero or 0),
    not a bool.
    """
    entry = _Py_ctype_islinebreak[_Py_CHARMASK(ch)]
    return entry & _PY_CTF_LB.CARRIAGE_RETURN
759
760
761# End code related to/from CPython's pyctype
762# ------------------------------------------------------------------------------
763