1import sys
2import operator
3
4import numpy as np
5from llvmlite.ir import IntType, Constant
6
7from numba.core.extending import (
8    models,
9    register_model,
10    make_attribute_wrapper,
11    unbox,
12    box,
13    NativeValue,
14    overload,
15    overload_method,
16    intrinsic,
17    register_jitable,
18)
19from numba.core.imputils import (lower_constant, lower_cast, lower_builtin,
20                                 iternext_impl, impl_ret_new_ref, RefType)
21from numba.core.datamodel import register_default, StructModel
22from numba.core import utils, types, cgutils
23from numba.core.pythonapi import (
24    PY_UNICODE_1BYTE_KIND,
25    PY_UNICODE_2BYTE_KIND,
26    PY_UNICODE_4BYTE_KIND,
27    PY_UNICODE_WCHAR_KIND,
28)
29from numba._helperlib import c_helpers
30from numba.cpython.hashing import _Py_hash_t
31from numba.core.unsafe.bytes import memcpy_region
32from numba.core.errors import TypingError
33from numba.cpython.unicode_support import (_Py_TOUPPER, _Py_TOLOWER, _Py_UCS4,
34                                           _Py_ISALNUM,
35                                           _PyUnicode_ToUpperFull,
36                                           _PyUnicode_ToLowerFull,
37                                           _PyUnicode_ToFoldedFull,
38                                           _PyUnicode_ToTitleFull,
39                                           _PyUnicode_IsPrintable,
40                                           _PyUnicode_IsSpace,
41                                           _Py_ISSPACE,
42                                           _PyUnicode_IsXidStart,
43                                           _PyUnicode_IsXidContinue,
44                                           _PyUnicode_IsCased,
45                                           _PyUnicode_IsCaseIgnorable,
46                                           _PyUnicode_IsUppercase,
47                                           _PyUnicode_IsLowercase,
48                                           _PyUnicode_IsLineBreak,
49                                           _Py_ISLINEBREAK,
50                                           _Py_ISLINEFEED,
51                                           _Py_ISCARRIAGERETURN,
52                                           _PyUnicode_IsTitlecase,
53                                           _Py_ISLOWER,
54                                           _Py_ISUPPER,
55                                           _Py_TAB,
56                                           _Py_LINEFEED,
57                                           _Py_CARRIAGE_RETURN,
58                                           _Py_SPACE,
59                                           _PyUnicode_IsAlpha,
60                                           _PyUnicode_IsNumeric,
61                                           _Py_ISALPHA,
62                                           _PyUnicode_IsDigit,
63                                           _PyUnicode_IsDecimalDigit)
64from numba.cpython import slicing
65
66
# True when the running Python version is 3.8 or newer.
_py38_or_later = utils.PYVERSION >= (3, 8)

# Largest code point value (0x10ffff); see the CPython reference below.
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L84-L85    # noqa: E501
_MAX_UNICODE = 0x10ffff
71
72# DATA MODEL
73
74
@register_model(types.UnicodeType)
class UnicodeModel(models.StructModel):
    """Struct data model used for ``types.UnicodeType`` values."""

    def __init__(self, dmm, fe_type):
        # The list order defines the order of the struct members.
        members = [
            # Pointer to the character buffer.
            ('data', types.voidptr),
            # Length of the string.
            ('length', types.intp),
            # Storage kind; compared against the PY_UNICODE_*_KIND constants.
            ('kind', types.int32),
            # Non-zero flag for ASCII-only strings.
            ('is_ascii', types.uint32),
            # Hash value; -1 is used to mean "not computed yet".
            ('hash', _Py_hash_t),
            ('meminfo', types.MemInfoPointer(types.voidptr)),
            # A pointer to the owner python str/unicode object
            ('parent', types.pyobject),
        ]
        super().__init__(dmm, fe_type, members)
89
90
# Expose selected struct members of unicode values as underscore-prefixed
# attributes (e.g. ``s._kind``, ``s._data``), which the jitted helpers in this
# file read directly.
make_attribute_wrapper(types.UnicodeType, 'data', '_data')
make_attribute_wrapper(types.UnicodeType, 'length', '_length')
make_attribute_wrapper(types.UnicodeType, 'kind', '_kind')
make_attribute_wrapper(types.UnicodeType, 'is_ascii', '_is_ascii')
make_attribute_wrapper(types.UnicodeType, 'hash', '_hash')
96
97
@register_default(types.UnicodeIteratorType)
class UnicodeIteratorModel(StructModel):
    """Struct data model used for ``types.UnicodeIteratorType`` values."""

    def __init__(self, dmm, fe_type):
        # 'index' is a pointer to an unsigned counter; 'data' has the type
        # carried by the iterator type itself.
        index_member = ('index', types.EphemeralPointer(types.uintp))
        data_member = ('data', fe_type.data)
        super().__init__(dmm, fe_type, [index_member, data_member])
104
105# CAST
106
107
def compile_time_get_string_data(obj):
    """Read the raw buffer and metadata of a Python string at compile time.

    The ``extract_unicode`` C helper is called through ctypes so that the
    string data can later be embedded into the LLVM module.

    Returns
    -------
    tuple
        ``(data_bytes, length, kind, is_ascii, hash)`` where ``data_bytes``
        covers ``length + 1`` code units of the width implied by ``kind``.

    Raises
    ------
    ValueError
        If the helper returns a NULL data pointer.
    """
    import ctypes

    helper_proto = ctypes.CFUNCTYPE(
        ctypes.c_void_p,
        ctypes.py_object,
        ctypes.POINTER(ctypes.c_ssize_t),
        ctypes.POINTER(ctypes.c_int),
        ctypes.POINTER(ctypes.c_uint),
        ctypes.POINTER(ctypes.c_ssize_t),
    )
    extract = helper_proto(c_helpers['extract_unicode'])

    # Output slots filled in by the helper.
    out_length = ctypes.c_ssize_t()
    out_kind = ctypes.c_int()
    out_ascii = ctypes.c_uint()
    out_hash = ctypes.c_ssize_t()
    address = extract(
        obj,
        ctypes.byref(out_length),
        ctypes.byref(out_kind),
        ctypes.byref(out_ascii),
        ctypes.byref(out_hash),
    )
    if address is None:
        raise ValueError("cannot extract unicode data from the given string")

    n_chars = out_length.value
    str_kind = out_kind.value
    # Copy one code unit more than the string length.
    n_bytes = (n_chars + 1) * _kind_to_byte_width(str_kind)
    raw = (ctypes.c_ubyte * n_bytes).from_address(address)
    return bytes(raw), n_chars, str_kind, out_ascii.value, out_hash.value
134
135
def make_string_from_constant(context, builder, typ, literal_string):
    """
    Build a unicode_type LLVM value for the Python string `literal_string`.

    The string data is obtained with `compile_time_get_string_data()` and
    inserted into the module as constant bytes.
    """
    payload, n_chars, str_kind, ascii_flag, _hash = \
        compile_time_get_string_data(literal_string)
    const_data = context.insert_const_bytes(builder.module, payload)

    proxy = cgutils.create_struct_proxy(typ)(context, builder)
    proxy.data = const_data
    proxy.length = proxy.length.type(n_chars)
    proxy.kind = proxy.kind.type(str_kind)
    proxy.is_ascii = proxy.is_ascii.type(ascii_flag)
    # Set hash to -1 to indicate that it should be computed.
    # We cannot bake in the hash value because of hashseed randomization.
    proxy.hash = proxy.hash.type(-1)
    return proxy._getvalue()
154
155
@lower_cast(types.StringLiteral, types.unicode_type)
def cast_from_literal(context, builder, fromty, toty, val):
    """Lower a StringLiteral -> unicode_type cast from the literal's value."""
    literal = fromty.literal_value
    return make_string_from_constant(context, builder, toty, literal)
161
162
163# CONSTANT
164
@lower_constant(types.unicode_type)
def constant_unicode(context, builder, typ, pyval):
    """Lower the Python string constant `pyval` to a unicode_type value."""
    lowered = make_string_from_constant(context, builder, typ, pyval)
    return lowered
168
169
170# BOXING
171
172
@unbox(types.UnicodeType)
def unbox_unicode_str(typ, obj, c):
    """
    Convert a unicode str object to a native unicode structure.

    The native value borrows the data pointer of `obj`; a meminfo that owns a
    reference to `obj` is attached to it.
    """
    pyapi = c.pyapi
    builder = c.builder
    (_ok, buf, n_chars, str_kind,
     ascii_flag, hash_val) = pyapi.string_as_string_size_and_kind(obj)

    native = cgutils.create_struct_proxy(typ)(c.context, builder)
    native.data = buf
    native.length = n_chars
    native.kind = str_kind
    native.is_ascii = ascii_flag
    native.hash = hash_val
    # `buf` is the borrowed data pointer, `obj` is the owner pyobject; the
    # call will incref it.
    native.meminfo = pyapi.nrt_meminfo_new_from_pyobject(buf, obj)
    native.parent = obj

    failed = cgutils.is_not_null(builder, pyapi.err_occurred())
    return NativeValue(native._getvalue(), is_error=failed)
194
195
@box(types.UnicodeType)
def box_unicode_str(typ, val, c):
    """
    Convert a native unicode structure to a unicode string
    """
    pyapi = c.pyapi
    native = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
    boxed = pyapi.string_from_kind_and_data(
        native.kind, native.data, native.length)
    # The hash result is discarded: it is computed only so that it lands in
    # the unicodeobject hash cache (CPython does not always populate it,
    # depending on how the string was created). It is safe, and just burns
    # the cycles required to hash on @box.
    pyapi.object_hash(boxed)
    c.context.nrt.decref(c.builder, typ, val)
    return boxed
210
211
212# HELPER FUNCTIONS
213
214
def make_deref_codegen(bitsize):
    """Return a codegen function that loads one `bitsize`-bit unit.

    The generated code treats `data` as a pointer to `bitsize`-bit integers,
    loads the element at index `idx` and zero-extends it to 32 bits.
    """
    def codegen(context, builder, signature, args):
        base, index = args
        typed_base = builder.bitcast(base, IntType(bitsize).as_pointer())
        elem_addr = builder.gep(typed_base, [index])
        loaded = builder.load(elem_addr)
        return builder.zext(loaded, IntType(32))

    return codegen
223
224
@intrinsic
def deref_uint8(typingctx, data, offset):
    """Intrinsic: read the 8-bit unit at index `offset` of `data` as uint32."""
    return types.uint32(types.voidptr, types.intp), make_deref_codegen(8)
229
230
@intrinsic
def deref_uint16(typingctx, data, offset):
    """Intrinsic: read the 16-bit unit at index `offset` of `data` as uint32."""
    return types.uint32(types.voidptr, types.intp), make_deref_codegen(16)
235
236
@intrinsic
def deref_uint32(typingctx, data, offset):
    """Intrinsic: read the 32-bit unit at index `offset` of `data` as uint32."""
    return types.uint32(types.voidptr, types.intp), make_deref_codegen(32)
241
242
@intrinsic
def _malloc_string(typingctx, kind, char_bytes, length, is_ascii):
    """Allocate a new unicode struct with a fresh data buffer.

    The buffer is ``char_bytes * (length + 1)`` bytes, i.e. it has room for
    one padding character after the string contents. The `kind`, `length`
    and `is_ascii` members are filled in from the arguments, the hash is set
    to -1 and the parent to NULL. The buffer contents are not written here.
    """
    def codegen(context, builder, signature, args):
        kind_val, char_bytes_val, length_val, is_ascii_val = args

        new_str = cgutils.create_struct_proxy(types.unicode_type)(
            context, builder)

        # add null padding character
        one = Constant(length_val.type, 1)
        padded_length = builder.add(length_val, one)
        nbytes_val = builder.mul(char_bytes_val, padded_length)

        # fill the struct
        new_str.meminfo = context.nrt.meminfo_alloc(builder, nbytes_val)
        new_str.kind = kind_val
        new_str.is_ascii = is_ascii_val
        new_str.length = length_val
        # empty string has hash value -1 to indicate "need to compute hash"
        new_str.hash = context.get_constant(_Py_hash_t, -1)
        new_str.data = context.nrt.meminfo_data(builder, new_str.meminfo)
        # Set parent to NULL
        new_str.parent = cgutils.get_null_value(new_str.parent.type)
        return new_str._getvalue()

    sig = types.unicode_type(types.int32, types.intp, types.intp, types.uint32)
    return sig, codegen
272
273
@register_jitable
def _empty_string(kind, length, is_ascii=0):
    """Allocate a string of `length` characters of the given `kind`.

    Only the terminating NULL character is written; the `length` characters
    before it are left for the caller to fill in.
    """
    new_str = _malloc_string(kind, _kind_to_byte_width(kind), length, is_ascii)
    # Write NULL character
    _set_code_point(new_str, length, np.uint32(0))
    return new_str
280
281
# Disable RefCt for performance.
@register_jitable(_nrt=False)
def _get_code_point(a, i):
    """Return the code point stored at index `i` of string `a`.

    No bounds checking is performed. An unrecognised kind yields 0.
    """
    kind = a._kind
    if kind == PY_UNICODE_1BYTE_KIND:
        return deref_uint8(a._data, i)
    if kind == PY_UNICODE_2BYTE_KIND:
        return deref_uint16(a._data, i)
    if kind == PY_UNICODE_4BYTE_KIND:
        return deref_uint32(a._data, i)
    # there's also a wchar kind, but that's one of the above,
    # so skipping for this example
    return 0
295
296####
297
298
def make_set_codegen(bitsize):
    """Return a codegen function that stores one `bitsize`-bit unit.

    The generated code treats `data` as a pointer to `bitsize`-bit integers
    and stores `ch` at index `idx`, truncating it first when `bitsize` is
    smaller than 32.
    """
    def codegen(context, builder, signature, args):
        base, index, value = args
        unit_ty = IntType(bitsize)
        if bitsize < 32:
            value = builder.trunc(value, unit_ty)
        typed_base = builder.bitcast(base, unit_ty.as_pointer())
        builder.store(value, builder.gep(typed_base, [index]))
        return context.get_dummy_value()

    return codegen
309
310
@intrinsic
def set_uint8(typingctx, data, idx, ch):
    """Intrinsic: store `ch` as the 8-bit unit at index `idx` of `data`."""
    store_sig = types.void(types.voidptr, types.int64, types.uint32)
    return store_sig, make_set_codegen(8)
315
316
@intrinsic
def set_uint16(typingctx, data, idx, ch):
    """Intrinsic: store `ch` as the 16-bit unit at index `idx` of `data`."""
    store_sig = types.void(types.voidptr, types.int64, types.uint32)
    return store_sig, make_set_codegen(16)
321
322
@intrinsic
def set_uint32(typingctx, data, idx, ch):
    """Intrinsic: store `ch` as the 32-bit unit at index `idx` of `data`."""
    store_sig = types.void(types.voidptr, types.int64, types.uint32)
    return store_sig, make_set_codegen(32)
327
328
@register_jitable(_nrt=False)
def _set_code_point(a, i, ch):
    """Write code point `ch` at index `i` of string `a`, in place.

    Raises AssertionError for an unrecognised kind.
    """
    # WARNING: This method is very dangerous:
    #   * Assumes that data contents can be changed (only allowed for new
    #     strings)
    #   * Assumes that the kind of unicode string is sufficiently wide to
    #     accept ch.  Will truncate ch to make it fit.
    #   * Assumes that i is within the valid boundaries of the function
    kind = a._kind
    if kind == PY_UNICODE_1BYTE_KIND:
        set_uint8(a._data, i, ch)
        return
    if kind == PY_UNICODE_2BYTE_KIND:
        set_uint16(a._data, i, ch)
        return
    if kind == PY_UNICODE_4BYTE_KIND:
        set_uint32(a._data, i, ch)
        return
    raise AssertionError(
        "Unexpected unicode representation in _set_code_point")
346
347
@register_jitable
def _pick_kind(kind1, kind2):
    """Pick the kind able to hold characters of both `kind1` and `kind2`.

    Raises AssertionError for the WCHAR kind or an unrecognised `kind1`.
    """
    if kind1 == PY_UNICODE_WCHAR_KIND or kind2 == PY_UNICODE_WCHAR_KIND:
        raise AssertionError("PY_UNICODE_WCHAR_KIND unsupported")

    if kind1 == PY_UNICODE_1BYTE_KIND:
        # The narrowest kind never wins over the other operand.
        return kind2
    if kind1 == PY_UNICODE_2BYTE_KIND:
        if kind2 == PY_UNICODE_4BYTE_KIND:
            return kind2
        return kind1
    if kind1 == PY_UNICODE_4BYTE_KIND:
        return kind1
    raise AssertionError("Unexpected unicode representation in _pick_kind")
364
365
@register_jitable
def _pick_ascii(is_ascii1, is_ascii2):
    """Return uint32 1 when both flags equal 1, otherwise uint32 0."""
    if is_ascii1 == 1:
        if is_ascii2 == 1:
            return types.uint32(1)
    return types.uint32(0)
371
372
@register_jitable
def _kind_to_byte_width(kind):
    """Return the number of bytes per character (1, 2 or 4) for `kind`.

    Raises AssertionError for the WCHAR kind and for any unknown kind.
    """
    if kind == PY_UNICODE_1BYTE_KIND:
        return 1
    if kind == PY_UNICODE_2BYTE_KIND:
        return 2
    if kind == PY_UNICODE_4BYTE_KIND:
        return 4
    if kind == PY_UNICODE_WCHAR_KIND:
        raise AssertionError("PY_UNICODE_WCHAR_KIND unsupported")
    raise AssertionError("Unexpected unicode encoding encountered")
385
386
@register_jitable(_nrt=False)
def _cmp_region(a, a_offset, b, b_offset, n):
    """Compare `n` code points of `a` and `b` starting at the given offsets.

    Returns 0 when the regions are equal (or `n` is 0), -1 when the region of
    `a` sorts first or does not fit inside `a`, and 1 when the region of `b`
    sorts first or does not fit inside `b`.
    """
    if n == 0:
        return 0
    if a_offset + n > a._length:
        return -1
    if b_offset + n > b._length:
        return 1

    for k in range(n):
        lhs = _get_code_point(a, a_offset + k)
        rhs = _get_code_point(b, b_offset + k)
        if lhs < rhs:
            return -1
        if lhs > rhs:
            return 1

    return 0
405
406
@register_jitable
def _codepoint_to_kind(cp):
    """
    Return the narrowest unicode kind able to store the codepoint `cp`.

    Raises ValueError when `cp` is above the Unicode maximum.
    """
    if cp < 256:
        return PY_UNICODE_1BYTE_KIND
    if cp < 65536:
        return PY_UNICODE_2BYTE_KIND

    # Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
    MAX_UNICODE = 0x10ffff
    if cp > MAX_UNICODE:
        msg = "Invalid codepoint. Found value greater than Unicode maximum"
        raise ValueError(msg)
    return PY_UNICODE_4BYTE_KIND
423
424
@register_jitable
def _codepoint_is_ascii(ch):
    """
    Tell whether the codepoint `ch` lies in the ASCII range (below 0x80).
    """
    return ch < 0x80
431
432
433# PUBLIC API
434
435
@overload(str)
def unicode_str(s):
    """Overload of ``str()`` for unicode values: returns the argument."""
    if not isinstance(s, types.UnicodeType):
        return None

    def str_impl(s):
        return s
    return str_impl
440
441
@overload(len)
def unicode_len(s):
    """Overload of ``len()`` for unicode values: reads the stored length."""
    if not isinstance(s, types.UnicodeType):
        return None

    def len_impl(s):
        return s._length
    return len_impl
448
449
@overload(operator.eq)
def unicode_eq(a, b):
    """Overload of ``==`` where at least one operand is a string type.

    Two string operands are compared by length and then by content; a string
    compared with a non-string is always unequal. Nothing is provided unless
    both types are internal.
    """
    if not a.is_internal or not b.is_internal:
        return None

    string_types = (types.UnicodeType, types.StringLiteral,
                    types.UnicodeCharSeq)
    a_is_str = isinstance(a, string_types)
    b_is_str = isinstance(b, string_types)

    if a_is_str and b_is_str:
        def eq_impl(a, b):
            # the str() is for UnicodeCharSeq, it's a nop else
            a = str(a)
            b = str(b)
            if len(a) != len(b):
                return False
            return _cmp_region(a, 0, b, 0, len(a)) == 0
        return eq_impl

    if a_is_str != b_is_str:
        # one of the things is unicode, everything compares False
        def eq_impl(a, b):
            return False
        return eq_impl
471
472
@overload(operator.ne)
def unicode_ne(a, b):
    """Overload of ``!=`` where at least one operand is a string type.

    Two string operands negate the ``==`` result; a string compared with a
    non-string is always unequal. Nothing is provided unless both types are
    internal.
    """
    if not a.is_internal or not b.is_internal:
        return None

    string_types = (types.UnicodeType, types.StringLiteral,
                    types.UnicodeCharSeq)
    a_is_str = isinstance(a, string_types)
    b_is_str = isinstance(b, string_types)

    if a_is_str and b_is_str:
        def ne_impl(a, b):
            return not (a == b)
        return ne_impl

    if a_is_str != b_is_str:
        # one of the things is unicode, everything compares True
        def ne_impl(a, b):
            return True
        return ne_impl
489
490
@overload(operator.lt)
def unicode_lt(a, b):
    """Overload of ``<`` for two unicode/string-literal operands."""
    string_types = (types.UnicodeType, types.StringLiteral)
    if not (isinstance(a, string_types) and isinstance(b, string_types)):
        return None

    def lt_impl(a, b):
        # Compare the common prefix first; ties are broken by length.
        common = min(len(a), len(b))
        order = _cmp_region(a, 0, b, 0, common)
        if order == 0:
            return len(a) < len(b)
        return order == -1
    return lt_impl
505
506
@overload(operator.gt)
def unicode_gt(a, b):
    """Overload of ``>`` for two unicode/string-literal operands."""
    string_types = (types.UnicodeType, types.StringLiteral)
    if not (isinstance(a, string_types) and isinstance(b, string_types)):
        return None

    def gt_impl(a, b):
        # Compare the common prefix first; ties are broken by length.
        common = min(len(a), len(b))
        order = _cmp_region(a, 0, b, 0, common)
        if order == 0:
            return len(a) > len(b)
        return order == 1
    return gt_impl
521
522
@overload(operator.le)
def unicode_le(a, b):
    """Overload of ``<=`` for two strings, defined as ``not (a > b)``."""
    string_types = (types.UnicodeType, types.StringLiteral)
    if not (isinstance(a, string_types) and isinstance(b, string_types)):
        return None

    def le_impl(a, b):
        return not (a > b)
    return le_impl
531
532
@overload(operator.ge)
def unicode_ge(a, b):
    """Overload of ``>=`` for two strings, defined as ``not (a < b)``."""
    string_types = (types.UnicodeType, types.StringLiteral)
    if not (isinstance(a, string_types) and isinstance(b, string_types)):
        return None

    def ge_impl(a, b):
        return not (a < b)
    return ge_impl
541
542
@overload(operator.contains)
def unicode_contains(a, b):
    """Overload of ``b in a`` for two unicode operands, based on a search."""
    if not isinstance(a, types.UnicodeType):
        return None
    if not isinstance(b, types.UnicodeType):
        return None

    def contains_impl(a, b):
        # note parameter swap: contains(a, b) == b in a
        return _find(a, b) > -1
    return contains_impl
550
551
def unicode_idx_check_type(ty, name):
    """Validate that an index argument is typed as an integer or None.

    ty: type
        Type of the object
    name: str
        Name of the object, used in the error message

    Raises TypingError when the type is not accepted.
    """
    if isinstance(ty, types.Omitted):
        # if the type is omitted, the concrete type is the value
        concrete = ty.value
    elif isinstance(ty, types.Optional):
        # if the type is optional, the concrete type is the captured type
        concrete = ty.type
    else:
        concrete = ty

    accepted = (types.Integer, types.NoneType)
    if concrete is None or isinstance(concrete, accepted):
        return
    raise TypingError('"{}" must be {}, not {}'.format(name, accepted, ty))
570
571
def unicode_sub_check_type(ty, name):
    """Raise TypingError unless `ty` is a unicode type.

    `name` is the argument name reported in the error message.
    """
    if isinstance(ty, types.UnicodeType):
        return
    raise TypingError(
        '"{}" must be {}, not {}'.format(name, types.UnicodeType, ty))
577
578
def generate_finder(find_func):
    """Wrap `find_func` with default handling and clamping of start/end.

    The returned function fills in missing bounds, normalises them with
    `_adjust_indices`, returns -1 when the window is shorter than `substr`,
    and otherwise delegates to `find_func(data, substr, start, end)`.
    """
    def impl(data, substr, start=None, end=None):
        data_len = len(data)
        if start is None:
            start = 0
        if end is None:
            end = data_len

        start, end = _adjust_indices(data_len, start, end)
        # The window is too small to contain the substring.
        if end - start < len(substr):
            return -1

        return find_func(data, substr, start, end)

    return impl
596
597
@register_jitable
def _finder(data, substr, start, end):
    """Left finder: lowest matching index in [start, end), or -1.

    An empty `substr` matches immediately at `start`.
    """
    sub_len = len(substr)
    if sub_len == 0:
        return start
    # Last index at which a full match still fits.
    last = min(len(data), end) - sub_len
    for pos in range(start, last + 1):
        if _cmp_region(data, pos, substr, 0, sub_len) == 0:
            return pos
    return -1
607
608
@register_jitable
def _rfinder(data, substr, start, end):
    """Right finder: highest matching index in [start, end), or -1.

    An empty `substr` matches immediately at `end`.
    """
    sub_len = len(substr)
    if sub_len == 0:
        return end
    # Highest index at which a full match still fits; scan downwards.
    first = min(len(data), end) - sub_len
    for pos in range(first, start - 1, -1):
        if _cmp_region(data, pos, substr, 0, sub_len) == 0:
            return pos
    return -1
618
619
# Jittable left/right search functions with start/end handling, built from the
# raw finders above. They are returned as the implementations of str.find()
# and str.rfind(), and `_find` is also used by the `in` operator overload.
_find = register_jitable(generate_finder(_finder))
_rfind = register_jitable(generate_finder(_rfinder))
622
623
@overload_method(types.UnicodeType, 'find')
def unicode_find(data, substr, start=None, end=None):
    """Implements str.find()

    `substr` may be a unicode string or a UnicodeCharSeq; `start` and `end`
    must be typed as integers or None. A TypingError is raised for any other
    argument types.
    """
    if isinstance(substr, types.UnicodeCharSeq):
        def find_impl(data, substr, start=None, end=None):
            # Forward the bounds so that a char-sequence needle honours
            # start/end exactly like a unicode needle does.
            return data.find(str(substr), start, end)
        return find_impl

    unicode_idx_check_type(start, 'start')
    unicode_idx_check_type(end, 'end')
    unicode_sub_check_type(substr, 'substr')

    return _find
637
638
@overload_method(types.UnicodeType, 'rfind')
def unicode_rfind(data, substr, start=None, end=None):
    """Implements str.rfind()

    `substr` may be a unicode string or a UnicodeCharSeq; `start` and `end`
    must be typed as integers or None. A TypingError is raised for any other
    argument types.
    """
    if isinstance(substr, types.UnicodeCharSeq):
        def rfind_impl(data, substr, start=None, end=None):
            # Forward the bounds so that a char-sequence needle honours
            # start/end exactly like a unicode needle does.
            return data.rfind(str(substr), start, end)
        return rfind_impl

    unicode_idx_check_type(start, 'start')
    unicode_idx_check_type(end, 'end')
    unicode_sub_check_type(substr, 'substr')

    return _rfind
652
653
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12831-L12857    # noqa: E501
@overload_method(types.UnicodeType, 'rindex')
def unicode_rindex(s, sub, start=None, end=None):
    """Implements str.rindex()

    Same as str.rfind(), except that a missing substring raises ValueError.
    """
    unicode_idx_check_type(start, 'start')
    unicode_idx_check_type(end, 'end')
    unicode_sub_check_type(sub, 'sub')

    def rindex_impl(s, sub, start=None, end=None):
        pos = s.rfind(sub, start, end)
        if pos >= 0:
            return pos
        raise ValueError('substring not found')

    return rindex_impl
670
671
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11692-L11718    # noqa: E501
@overload_method(types.UnicodeType, 'index')
def unicode_index(s, sub, start=None, end=None):
    """Implements str.index()

    Same as str.find(), except that a missing substring raises ValueError.
    """
    unicode_idx_check_type(start, 'start')
    unicode_idx_check_type(end, 'end')
    unicode_sub_check_type(sub, 'sub')

    def index_impl(s, sub, start=None, end=None):
        pos = s.find(sub, start, end)
        if pos >= 0:
            return pos
        raise ValueError('substring not found')

    return index_impl
688
689
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12922-L12976    # noqa: E501
@overload_method(types.UnicodeType, 'partition')
def unicode_partition(data, sep):
    """Implements str.partition()

    Returns ``(head, sep, tail)`` split at the first occurrence of `sep`, or
    ``(data, '', '')`` when `sep` is not found. An empty separator raises
    ValueError; a separator of an unsupported type raises TypingError.
    """
    if isinstance(sep, types.Omitted):
        # if the type is omitted, the concrete type is the value
        sep_ty = sep.value
    elif isinstance(sep, types.Optional):
        # if the type is optional, the concrete type is the captured type
        sep_ty = sep.type
    else:
        sep_ty = sep

    accepted = (types.UnicodeType, types.UnicodeCharSeq)
    if not (sep_ty is None or isinstance(sep_ty, accepted)):
        raise TypingError(
            '"{}" must be {}, not {}'.format('sep', accepted, sep))

    def impl(data, sep):
        # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/partition.h#L7-L60    # noqa: E501
        sep = str(sep)
        sep_len = len(sep)
        empty = _empty_string(data._kind, 0, data._is_ascii)

        # A wider or longer separator can never occur inside data.
        if data._kind < sep._kind or len(data) < sep_len:
            return data, empty, empty

        if sep_len == 0:
            raise ValueError('empty separator')

        idx = data.find(sep)
        if idx < 0:
            return data, empty, empty

        head = data[0:idx]
        tail = data[idx + sep_len:len(data)]
        return head, sep, tail

    return impl
725
726
@overload_method(types.UnicodeType, 'count')
def unicode_count(src, sub, start=None, end=None):
    """Implements str.count()

    Counts non-overlapping occurrences of `sub` within ``src[start:end]``.
    An empty `sub` yields the window length plus one. A TypingError is raised
    when `sub` is not a unicode type; `start` and `end` are validated by
    `_count_args_types_check`.
    """

    _count_args_types_check(start)
    _count_args_types_check(end)

    if isinstance(sub, types.UnicodeType):
        def count_impl(src, sub, start=None, end=None):
            count = 0
            src_len = len(src)
            sub_len = len(sub)

            start = _normalize_slice_idx_count(start, src_len, 0)
            end = _normalize_slice_idx_count(end, src_len, src_len)

            if end - start < 0 or start > src_len:
                return 0

            # From here on work on the selected window only.
            src = src[start : end]
            src_len = len(src)
            if sub_len == 0:
                return src_len + 1

            pos = 0
            while pos + sub_len <= src_len:
                # Compare in place rather than slicing: avoids allocating a
                # temporary string for every candidate position.
                if _cmp_region(src, pos, sub, 0, sub_len) == 0:
                    count += 1
                    # Matches must not overlap, so skip past this one.
                    pos += sub_len
                else:
                    pos += 1
            return count
        return count_impl
    error_msg = "The substring must be a UnicodeType, not {}"
    raise TypingError(error_msg.format(type(sub)))
761
762
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12979-L13033    # noqa: E501
@overload_method(types.UnicodeType, 'rpartition')
def unicode_rpartition(data, sep):
    """Implements str.rpartition()

    Returns ``(head, sep, tail)`` split at the last occurrence of `sep`, or
    ``('', '', data)`` when `sep` is not found. An empty separator raises
    ValueError; a separator of an unsupported type raises TypingError.
    """
    if isinstance(sep, types.Omitted):
        # if the type is omitted, the concrete type is the value
        sep_ty = sep.value
    elif isinstance(sep, types.Optional):
        # if the type is optional, the concrete type is the captured type
        sep_ty = sep.type
    else:
        sep_ty = sep

    accepted = (types.UnicodeType, types.UnicodeCharSeq)
    if not (sep_ty is None or isinstance(sep_ty, accepted)):
        raise TypingError(
            '"{}" must be {}, not {}'.format('sep', accepted, sep))

    def impl(data, sep):
        # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/partition.h#L62-L115    # noqa: E501
        sep = str(sep)
        sep_len = len(sep)
        empty = _empty_string(data._kind, 0, data._is_ascii)

        # A wider or longer separator can never occur inside data.
        if data._kind < sep._kind or len(data) < sep_len:
            return empty, empty, data

        if sep_len == 0:
            raise ValueError('empty separator')

        idx = data.rfind(sep)
        if idx < 0:
            return empty, empty, data

        head = data[0:idx]
        tail = data[idx + sep_len:len(data)]
        return head, sep, tail

    return impl
798
799
@overload_method(types.UnicodeType, 'startswith')
def unicode_startswith(a, b):
    """Implements str.startswith() for a unicode or UnicodeCharSeq prefix."""
    if isinstance(b, types.UnicodeType):
        def startswith_impl(a, b):
            # Equal leading regions mean `a` starts with `b`.
            prefix_len = len(b)
            return _cmp_region(a, 0, b, 0, prefix_len) == 0
        return startswith_impl

    if isinstance(b, types.UnicodeCharSeq):
        def startswith_charseq_impl(a, b):
            # Convert to a unicode string and dispatch again.
            return a.startswith(str(b))
        return startswith_charseq_impl
810
811
# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9342-L9354    # noqa: E501
@register_jitable
def _adjust_indices(length, start, end):
    """Normalise slice-style `start`/`end` for a sequence of `length` items.

    `end` is capped at `length`; negative values are offset by `length` and
    then floored at 0. Returns the adjusted ``(start, end)`` pair.
    """
    if end > length:
        end = length
    if end < 0:
        end = end + length
        if end < 0:
            end = 0

    if start < 0:
        start = start + length
        if start < 0:
            start = 0

    return start, end
827
828
@overload_method(types.UnicodeType, 'endswith')
def unicode_endswith(s, substr, start=None, end=None):
    """Implements str.endswith()

    `substr` may be a unicode string, a UnicodeCharSeq, or a tuple of
    suffixes. `start` and `end` must be integers or None, otherwise a
    TypingError is raised.
    """
    index_types = (types.Omitted, types.Integer, types.NoneType)
    if start is not None and not isinstance(start, index_types):
        raise TypingError('The arg must be a Integer or None')

    if end is not None and not isinstance(end, index_types):
        raise TypingError('The arg must be a Integer or None')

    if isinstance(substr, (types.Tuple, types.UniTuple)):
        def endswith_tuple_impl(s, substr, start=None, end=None):
            # True as soon as any one of the suffixes matches.
            for candidate in substr:
                if s.endswith(candidate, start, end) is True:
                    return True

            return False
        return endswith_tuple_impl

    if isinstance(substr, types.UnicodeType):
        def endswith_impl(s, substr, start=None, end=None):
            total = len(s)
            sub_len = len(substr)
            if start is None:
                start = 0
            if end is None:
                end = total

            start, end = _adjust_indices(total, start, end)
            if end - start < sub_len:
                return False

            if sub_len == 0:
                return True

            # Compare the tail of the selected window with the suffix.
            window = s[start:end]
            tail_offset = len(window) - sub_len
            return _cmp_region(window, tail_offset, substr, 0, sub_len) == 0
        return endswith_impl

    if isinstance(substr, types.UnicodeCharSeq):
        def endswith_charseq_impl(s, substr, start=None, end=None):
            # Convert to a unicode string and dispatch again.
            return s.endswith(str(substr), start, end)
        return endswith_charseq_impl
876
877
# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11519-L11595    # noqa: E501
@overload_method(types.UnicodeType, 'expandtabs')
def unicode_expandtabs(data, tabsize=8):
    """Implements str.expandtabs()

    Each tab is replaced by spaces up to the next multiple of `tabsize`,
    counted from the start of the current line. When `tabsize` is not
    positive, tabs are dropped. A string without tabs is returned unchanged.

    Raises TypingError when `tabsize` is not an integer, and (at run time)
    OverflowError when the expanded string would be too long.
    """
    thety = tabsize
    # if the type is omitted, the concrete type is the value
    if isinstance(tabsize, types.Omitted):
        thety = tabsize.value
    # if the type is optional, the concrete type is the captured type
    elif isinstance(tabsize, types.Optional):
        thety = tabsize.type

    accepted = (types.Integer, int)
    if thety is not None and not isinstance(thety, accepted):
        raise TypingError(
            '"tabsize" must be {}, not {}'.format(accepted, tabsize))

    def expandtabs_impl(data, tabsize=8):
        length = len(data)
        # First pass: compute the output length `j` and note whether any tab
        # is present. `line_pos` is the column within the current line.
        j = line_pos = 0
        found = False
        for i in range(length):
            code_point = _get_code_point(data, i)
            if code_point == _Py_TAB:
                found = True
                if tabsize > 0:
                    # cannot overflow
                    incr = tabsize - (line_pos % tabsize)
                    if j > sys.maxsize - incr:
                        raise OverflowError('new string is too long')
                    line_pos += incr
                    j += incr
            else:
                if j > sys.maxsize - 1:
                    raise OverflowError('new string is too long')
                line_pos += 1
                j += 1
                # A line break resets the column counter.
                if code_point in (_Py_LINEFEED, _Py_CARRIAGE_RETURN):
                    line_pos = 0

        # Nothing to expand: hand back the input string itself.
        if not found:
            return data

        # Second pass: same walk as above, this time writing the output.
        res = _empty_string(data._kind, j, data._is_ascii)
        j = line_pos = 0
        for i in range(length):
            code_point = _get_code_point(data, i)
            if code_point == _Py_TAB:
                if tabsize > 0:
                    incr = tabsize - (line_pos % tabsize)
                    line_pos += incr
                    # Pad with spaces up to the next tab stop.
                    for idx in range(j, j + incr):
                        _set_code_point(res, idx, _Py_SPACE)
                    j += incr
            else:
                line_pos += 1
                _set_code_point(res, j, code_point)
                j += 1
                if code_point in (_Py_LINEFEED, _Py_CARRIAGE_RETURN):
                    line_pos = 0

        return res

    return expandtabs_impl
942
943
@overload_method(types.UnicodeType, 'split')
def unicode_split(a, sep=None, maxsplit=-1):
    """Implements str.split()

    Three implementations are selected at typing time:

    * ``sep`` is a ``UnicodeCharSeq``: convert it with ``str()`` and
      re-dispatch.
    * ``sep`` is a unicode string: split on that separator; an empty
      separator raises ``ValueError`` at run time.
    * ``sep`` is ``None``/omitted: split on runs of whitespace.

    As in CPython, any negative ``maxsplit`` means "no limit". Typing fails
    (returns ``None``) when ``maxsplit`` is not an integer.
    """
    if not (maxsplit == -1 or
            isinstance(maxsplit, (types.Omitted, types.Integer,
                                  types.IntegerLiteral))):
        return None  # fail typing if maxsplit is not an integer

    if isinstance(sep, types.UnicodeCharSeq):
        def split_impl(a, sep=None, maxsplit=-1):
            return a.split(str(sep), maxsplit=maxsplit)
        return split_impl

    if isinstance(sep, types.UnicodeType):
        def split_impl(a, sep=None, maxsplit=-1):
            a_len = len(a)
            sep_len = len(sep)

            if sep_len == 0:
                raise ValueError('empty separator')

            parts = []
            last = 0
            idx = 0
            # every negative value (not just -1) disables the limit
            unlimited = maxsplit < 0

            if sep_len == 1 and unlimited:
                # fast path: compare single code points
                sep_code_point = _get_code_point(sep, 0)
                for idx in range(a_len):
                    if _get_code_point(a, idx) == sep_code_point:
                        parts.append(a[last:idx])
                        last = idx + 1
            else:
                split_count = 0

                while idx < a_len and (unlimited or
                                       split_count < maxsplit):
                    if _cmp_region(a, idx, sep, 0, sep_len) == 0:
                        parts.append(a[last:idx])
                        idx += sep_len
                        last = idx
                        split_count += 1
                    else:
                        idx += 1

            # the tail after the last separator (possibly empty)
            if last <= a_len:
                parts.append(a[last:])

            return parts
        return split_impl
    elif sep is None or isinstance(sep, types.NoneType) or \
            getattr(sep, 'value', False) is None:
        def split_whitespace_impl(a, sep=None, maxsplit=-1):
            # Mirrors CPython's stringlib split_whitespace: at most
            # `maxsplit` words are cut off the front; whatever follows is
            # kept verbatim (minus leading whitespace) as the last item.
            a_len = len(a)

            parts = []
            idx = 0
            split_count = 0
            unlimited = maxsplit < 0

            while unlimited or split_count < maxsplit:
                # skip the whitespace in front of the next word
                while idx < a_len:
                    if not _PyUnicode_IsSpace(_get_code_point(a, idx)):
                        break
                    idx += 1
                if idx >= a_len:
                    break

                # consume the word itself
                word_start = idx
                idx += 1
                while idx < a_len:
                    if _PyUnicode_IsSpace(_get_code_point(a, idx)):
                        break
                    idx += 1
                parts.append(a[word_start:idx])
                split_count += 1

            if idx < a_len:
                # Only reached when maxsplit was exhausted: drop leading
                # whitespace and keep the remainder as a single item.
                while idx < a_len:
                    if not _PyUnicode_IsSpace(_get_code_point(a, idx)):
                        break
                    idx += 1
                if idx < a_len:
                    parts.append(a[idx:])

            return parts
        return split_whitespace_impl
1027
1028
def generate_rsplit_whitespace_impl(isspace_func):
    """Generate whitespace rsplit func based on either ascii or unicode

    ``isspace_func`` is the code-point classifier used to recognise
    whitespace. The returned function scans ``data`` from the right and
    returns the words in left-to-right order.
    """

    def rsplit_whitespace_impl(data, sep=None, maxsplit=-1):
        # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L192-L240    # noqa: E501
        if maxsplit < 0:
            maxsplit = sys.maxsize

        pieces = []
        pos = len(data) - 1
        while maxsplit > 0:
            # step left over whitespace that trails the next word
            while pos >= 0 and isspace_func(_get_code_point(data, pos)):
                pos -= 1
            if pos < 0:
                break

            # pos sits on the last character of a word; find its start
            word_end = pos
            pos -= 1
            while (pos >= 0 and
                   not isspace_func(_get_code_point(data, pos))):
                pos -= 1
            pieces.append(data[pos + 1:word_end + 1])
            maxsplit -= 1

        # Anything left is only possible when maxsplit was reached: skip
        # the remaining whitespace and keep the head of the string whole.
        while pos >= 0 and isspace_func(_get_code_point(data, pos)):
            pos -= 1
        if pos >= 0:
            pieces.append(data[0:pos + 1])

        # pieces were collected right-to-left
        return pieces[::-1]

    return rsplit_whitespace_impl
1071
1072
# Two specialisations of the whitespace rsplit, one per whitespace
# classifier, wrapped with register_jitable so that jitted code (see the
# 'rsplit' overload) can call them.
unicode_rsplit_whitespace_impl = register_jitable(
    generate_rsplit_whitespace_impl(_PyUnicode_IsSpace))
ascii_rsplit_whitespace_impl = register_jitable(
    generate_rsplit_whitespace_impl(_Py_ISSPACE))
1077
1078
1079# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13095-L13108    # noqa: E501
@overload_method(types.UnicodeType, 'rsplit')
def unicode_rsplit(data, sep=None, maxsplit=-1):
    """Implements str.rsplit()

    ``sep`` must be a unicode string, a ``UnicodeCharSeq`` or ``None``, and
    ``maxsplit`` must be an integer; otherwise a ``TypingError`` is raised.
    With no separator the whitespace implementations are used; with a
    separator, an empty one raises ``ValueError`` at run time.
    """

    def _unicode_rsplit_check_type(ty, name, accepted):
        """Check object belongs to one of specified types"""
        thety = ty
        # if the type is omitted, the concrete type is the value
        if isinstance(ty, types.Omitted):
            thety = ty.value
        # if the type is optional, the concrete type is the captured type
        elif isinstance(ty, types.Optional):
            thety = ty.type

        if thety is not None and not isinstance(thety, accepted):
            raise TypingError(
                '"{}" must be {}, not {}'.format(name, accepted, ty))

    _unicode_rsplit_check_type(sep, 'sep', (types.UnicodeType,
                                            types.UnicodeCharSeq,
                                            types.NoneType))
    _unicode_rsplit_check_type(maxsplit, 'maxsplit', (types.Integer, int))

    if sep is None or isinstance(sep, (types.NoneType, types.Omitted)):

        def rsplit_whitespace_impl(data, sep=None, maxsplit=-1):
            # pick the classifier matching the string's contents
            if data._is_ascii:
                return ascii_rsplit_whitespace_impl(data, sep, maxsplit)
            return unicode_rsplit_whitespace_impl(data, sep, maxsplit)

        return rsplit_whitespace_impl

    def rsplit_impl(data, sep=None, maxsplit=-1):
        sep = str(sep)
        # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L286-L333    # noqa: E501
        # a wider or longer separator cannot occur inside data
        if data._kind < sep._kind or len(data) < len(sep):
            return [data]

        def _rsplit_char(data, ch, maxsplit):
            # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L242-L284    # noqa: E501
            result = []
            ch_code_point = _get_code_point(ch, 0)
            i = j = len(data) - 1
            while i >= 0 and maxsplit > 0:
                data_code_point = _get_code_point(data, i)
                if data_code_point == ch_code_point:
                    result.append(data[i + 1 : j + 1])
                    # Move just left of the separator. The character now
                    # at `i` has not been examined yet, so it must not be
                    # skipped (it may itself be a separator).
                    j = i = i - 1
                    maxsplit -= 1
                else:
                    i -= 1
            if j >= -1:
                result.append(data[0 : j + 1])

            # parts were collected right-to-left
            return result[::-1]

        if maxsplit < 0:
            maxsplit = sys.maxsize

        sep_length = len(sep)

        if sep_length == 0:
            raise ValueError('empty separator')
        if sep_length == 1:
            return _rsplit_char(data, sep, maxsplit)

        result = []
        j = len(data)
        while maxsplit > 0:
            # search leftwards within the not-yet-consumed prefix
            pos = data.rfind(sep, start=0, end=j)
            if pos < 0:
                break
            result.append(data[pos + sep_length:j])
            j = pos
            maxsplit -= 1

        result.append(data[0:j])

        return result[::-1]

    return rsplit_impl
1160
1161
@overload_method(types.UnicodeType, 'center')
def unicode_center(string, width, fillchar=' '):
    """Typing function for ``str.center(width, fillchar)``.

    ``width`` must be an integer and ``fillchar`` a unicode string (or
    omitted); a ``UnicodeCharSeq`` fill is converted with ``str()`` and
    re-dispatched. Anything else raises ``TypingError``.
    """
    if not isinstance(width, types.Integer):
        raise TypingError('The width must be an Integer')

    if isinstance(fillchar, types.UnicodeCharSeq):
        def center_impl(string, width, fillchar=' '):
            return string.center(width, str(fillchar))
        return center_impl

    fill_ok = (fillchar == ' ' or
               isinstance(fillchar, (types.Omitted, types.UnicodeType)))
    if not fill_ok:
        raise TypingError('The fillchar must be a UnicodeType')

    def center_impl(string, width, fillchar=' '):
        if len(fillchar) != 1:
            raise ValueError('The fill character must be exactly one '
                             'character long')

        str_len = len(string)
        if width <= str_len:
            # already wide enough
            return string

        total_pad = width - str_len
        # the odd leftover character goes to the left only when both the
        # padding and the width are odd
        left = (total_pad // 2) + (total_pad & width & 1)
        right = total_pad - left

        left_fill = fillchar * left
        if left != right:
            return left_fill + string + (fillchar * right)
        # symmetric padding: reuse the already built fill
        return left_fill + string + left_fill

    return center_impl
1198
1199
def gen_unicode_Xjust(STRING_FIRST):
    """Build the typing function shared by ``ljust`` and ``rjust``.

    With ``STRING_FIRST`` true the padding is appended after the string,
    otherwise it is placed in front of it.
    """
    def unicode_Xjust(string, width, fillchar=' '):
        if not isinstance(width, types.Integer):
            raise TypingError('The width must be an Integer')

        if isinstance(fillchar, types.UnicodeCharSeq):
            # convert the fill to a unicode string and re-dispatch
            if not STRING_FIRST:
                def rjust_impl(string, width, fillchar=' '):
                    return string.rjust(width, str(fillchar))
                return rjust_impl

            def ljust_impl(string, width, fillchar=' '):
                return string.ljust(width, str(fillchar))
            return ljust_impl

        fill_ok = (fillchar == ' ' or
                   isinstance(fillchar, (types.Omitted, types.UnicodeType)))
        if not fill_ok:
            raise TypingError('The fillchar must be a UnicodeType')

        def impl(string, width, fillchar=' '):
            if len(fillchar) != 1:
                raise ValueError('The fill character must be exactly one '
                                 'character long')

            str_len = len(string)
            if width <= str_len:
                # already wide enough
                return string

            padding = fillchar * (width - str_len)
            if STRING_FIRST:
                return string + padding
            return padding + string

        return impl

    return unicode_Xjust
1239
1240
# Register the generated typing functions as string methods:
# STRING_FIRST=False puts the padding in front (rjust), STRING_FIRST=True
# puts it after the string (ljust).
overload_method(types.UnicodeType, 'rjust')(gen_unicode_Xjust(False))
overload_method(types.UnicodeType, 'ljust')(gen_unicode_Xjust(True))
1243
1244
def generate_splitlines_func(is_line_break_func):
    """Generate splitlines performer based on ascii or unicode line breaks.

    ``is_line_break_func`` classifies a code point as a line break. The
    returned ``impl(data, keepends)`` yields the list of lines, keeping the
    line endings only when ``keepends`` is true.
    """
    def impl(data, keepends):
        # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L335-L389    # noqa: E501
        n_chars = len(data)
        lines = []
        line_start = 0
        pos = 0
        while pos < n_chars:
            # advance to the next line break (or the end of the data)
            while (pos < n_chars and
                   not is_line_break_func(_get_code_point(data, pos))):
                pos += 1

            line_end = pos
            if pos < n_chars:
                # step over the break, counting CRLF as a single break
                if pos + 1 < n_chars:
                    here = _get_code_point(data, pos)
                    following = _get_code_point(data, pos + 1)
                    if (_Py_ISCARRIAGERETURN(here) and
                            _Py_ISLINEFEED(following)):
                        pos += 1
                pos += 1
                if keepends:
                    # include the line ending in the emitted line
                    line_end = pos

            lines.append(data[line_start:line_end])
            line_start = pos

        return lines

    return impl
1278
1279
# Two jitable specialisations of the splitlines worker, one per line-break
# classifier; the 'splitlines' overload picks between them at run time.
_ascii_splitlines = register_jitable(generate_splitlines_func(_Py_ISLINEBREAK))
_unicode_splitlines = register_jitable(generate_splitlines_func(
    _PyUnicode_IsLineBreak))
1283
1284
1285# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10196-L10229    # noqa: E501
@overload_method(types.UnicodeType, 'splitlines')
def unicode_splitlines(data, keepends=False):
    """Implements str.splitlines()

    ``keepends`` must resolve to an integer or boolean (or ``None``);
    otherwise a ``TypingError`` is raised.
    """
    # Work out which concrete type stands behind the argument.
    if isinstance(keepends, types.Omitted):
        # an omitted argument carries its default *value*
        concrete = keepends.value
    elif isinstance(keepends, types.Optional):
        # an optional argument wraps the captured type
        concrete = keepends.type
    else:
        concrete = keepends

    accepted = (types.Integer, int, types.Boolean, bool)
    if not (concrete is None or isinstance(concrete, accepted)):
        raise TypingError(
            '"{}" must be {}, not {}'.format('keepends', accepted, keepends))

    def splitlines_impl(data, keepends=False):
        # choose the worker matching the string's contents
        if not data._is_ascii:
            return _unicode_splitlines(data, keepends)

        return _ascii_splitlines(data, keepends)

    return splitlines_impl
1309
1310
@register_jitable
def join_list(sep, parts):
    """Concatenate the strings in ``parts`` with ``sep`` between them.

    Returns ``''`` for an empty list. The result is allocated once, sized
    and typed from the separator and all parts, and then filled by copying.
    """
    n_parts = len(parts)
    if n_parts == 0:
        return ''

    # Measure: total length plus the kind/ascii flag covering every piece.
    sep_len = len(sep)
    total_len = (n_parts - 1) * sep_len
    out_kind = sep._kind
    out_ascii = sep._is_ascii
    for item in parts:
        total_len += len(item)
        out_kind = _pick_kind(out_kind, item._kind)
        out_ascii = _pick_ascii(out_ascii, item._is_ascii)

    joined = _empty_string(out_kind, total_len, out_ascii)

    # Fill: first part, then (separator, part) for each remaining one.
    first = parts[0]
    first_len = len(first)
    _strncpy(joined, 0, first, 0, first_len)
    cursor = first_len
    for k in range(1, n_parts):
        _strncpy(joined, cursor, sep, 0, sep_len)
        cursor += sep_len
        piece = parts[k]
        piece_len = len(piece)
        _strncpy(joined, cursor, piece, 0, piece_len)
        cursor += piece_len

    return joined
1341
1342
@overload_method(types.UnicodeType, 'join')
def unicode_join(sep, parts):
    """Typing function for ``sep.join(parts)``.

    Supports lists of unicode strings or ``UnicodeCharSeq``, other iterable
    types, and a plain unicode string. Returns ``None`` (no implementation)
    for anything else, including lists of other element types.
    """
    if isinstance(parts, types.List):
        elem_ty = parts.dtype
        if isinstance(elem_ty, types.UnicodeType):
            def join_list_impl(sep, parts):
                return join_list(sep, parts)
            return join_list_impl

        if isinstance(elem_ty, types.UnicodeCharSeq):
            def join_list_impl(sep, parts):
                # convert every element to a unicode string first
                _parts = [str(p) for p in parts]
                return join_list(sep, _parts)
            return join_list_impl

        # lists of any other type not supported
        return None

    if isinstance(parts, types.IterableType):
        def join_iter_impl(sep, parts):
            # materialise the iterable so it can be measured and indexed
            parts_list = [p for p in parts]
            return join_list(sep, parts_list)
        return join_iter_impl

    if isinstance(parts, types.UnicodeType):
        # Temporary workaround until UnicodeType is iterable
        def join_str_impl(sep, parts):
            parts_list = [parts[i] for i in range(len(parts))]
            return join_list(sep, parts_list)
        return join_str_impl

    return None
1369
1370
@overload_method(types.UnicodeType, 'zfill')
def unicode_zfill(string, width):
    """Typing function for ``str.zfill(width)``.

    Raises ``TypingError`` unless ``width`` is an integer.
    """
    if not isinstance(width, types.Integer):
        raise TypingError("<width> must be an Integer")

    def zfill_impl(string, width):
        str_len = len(string)
        if width <= str_len:
            # already wide enough
            return string

        zeros = '0' * (width - str_len)
        lead = string[0] if str_len else ''

        if lead in ['+', '-']:
            # keep the sign in front of the inserted zeros
            return lead + zeros + string[1:]
        return zeros + string

    return zfill_impl
1394
1395
1396# ------------------------------------------------------------------------------
1397# Strip functions
1398# ------------------------------------------------------------------------------
@register_jitable
def unicode_strip_left_bound(string, chars):
    """Return the index of the first character that is kept by a left strip.

    With ``chars`` set, characters contained in ``chars`` are stripped;
    with ``chars`` equal to ``None``, whitespace is stripped. Returns
    ``len(string)`` when every character is stripped.
    """
    n_chars = len(string)

    pos = 0
    if chars is None:
        for pos in range(n_chars):
            if not _PyUnicode_IsSpace(string[pos]):
                return pos
    else:
        for pos in range(n_chars):
            if string[pos] not in chars:
                return pos

    return n_chars
1414
1415
@register_jitable
def unicode_strip_right_bound(string, chars):
    """Return the exclusive end index of what a right strip keeps.

    With ``chars`` set, characters contained in ``chars`` are stripped;
    with ``chars`` equal to ``None``, whitespace is stripped. Returns ``0``
    when the string is empty or every character is stripped.
    """
    n_chars = len(string)
    bound = 0
    if chars is None:
        for bound in range(n_chars - 1, -1, -1):
            if not _PyUnicode_IsSpace(string[bound]):
                # bound is the last kept character; make it exclusive
                bound += 1
                break
    else:
        for bound in range(n_chars - 1, -1, -1):
            if string[bound] not in chars:
                bound += 1
                break

    return bound
1432
1433
def unicode_strip_types_check(chars):
    """Validate the type of the ``chars`` argument of the strip methods.

    Accepts ``None``, an omitted argument, a unicode type or the none type
    (also when wrapped in an optional). Raises ``TypingError`` otherwise.
    """
    if isinstance(chars, types.Optional):
        # catch optional type with invalid non-None type
        chars = chars.type
    if chars is None:
        return
    allowed = (types.Omitted, types.UnicodeType, types.NoneType)
    if not isinstance(chars, allowed):
        raise TypingError('The arg must be a UnicodeType or None')
1441
1442
def _count_args_types_check(arg):
    """Validate the type of a slice-index argument.

    Accepts ``None``, an omitted argument, an integer type or the none type
    (also when wrapped in an optional). Raises ``TypingError`` otherwise.
    """
    if isinstance(arg, types.Optional):
        # look at the type wrapped by the optional
        arg = arg.type
    if arg is None:
        return
    allowed = (types.Omitted, types.Integer, types.NoneType)
    if not isinstance(arg, allowed):
        raise TypingError("The slice indices must be an Integer or None")
1450
1451
@overload_method(types.UnicodeType, 'lstrip')
def unicode_lstrip(string, chars=None):
    """Typing function for ``str.lstrip(chars)``.

    A ``UnicodeCharSeq`` argument is converted with ``str()`` and
    re-dispatched; any other argument type is validated first.
    """
    if isinstance(chars, types.UnicodeCharSeq):
        def lstrip_impl(string, chars=None):
            return string.lstrip(str(chars))
    else:
        unicode_strip_types_check(chars)

        def lstrip_impl(string, chars=None):
            first_kept = unicode_strip_left_bound(string, chars)
            return string[first_kept:]

    return lstrip_impl
1465
1466
@overload_method(types.UnicodeType, 'rstrip')
def unicode_rstrip(string, chars=None):
    """Typing function for ``str.rstrip(chars)``.

    A ``UnicodeCharSeq`` argument is converted with ``str()`` and
    re-dispatched; any other argument type is validated first.
    """
    if isinstance(chars, types.UnicodeCharSeq):
        def rstrip_impl(string, chars=None):
            return string.rstrip(str(chars))
    else:
        unicode_strip_types_check(chars)

        def rstrip_impl(string, chars=None):
            kept_end = unicode_strip_right_bound(string, chars)
            return string[:kept_end]

    return rstrip_impl
1480
1481
@overload_method(types.UnicodeType, 'strip')
def unicode_strip(string, chars=None):
    """Typing function for ``str.strip(chars)``.

    A ``UnicodeCharSeq`` argument is converted with ``str()`` and
    re-dispatched; any other argument type is validated first.
    """
    if isinstance(chars, types.UnicodeCharSeq):
        def strip_impl(string, chars=None):
            return string.strip(str(chars))
    else:
        unicode_strip_types_check(chars)

        def strip_impl(string, chars=None):
            first_kept = unicode_strip_left_bound(string, chars)
            kept_end = unicode_strip_right_bound(string, chars)
            return string[first_kept:kept_end]

    return strip_impl
1497
1498
1499# ------------------------------------------------------------------------------
1500# Slice functions
1501# ------------------------------------------------------------------------------
1502
@register_jitable
def normalize_str_idx(idx, length, is_start=True):
    """
    Turn a possibly negative or missing index into a valid position.

    Parameters
    ----------
    idx : int or None
        the index; negative values count from the end
    length : int
        the string length
    is_start : bool; optional with defaults to True
        Is it the *start* or the *stop* of the slice? Only consulted when
        ``idx`` is None.

    Returns
    -------
    norm_idx : int
        normalized index; 0 or ``length`` when ``idx`` is None

    Raises
    ------
    IndexError
        if the index falls outside ``[0, length)`` after normalization
    """
    if idx is None:
        if is_start:
            return 0
        return length

    if idx < 0:
        # count from the end
        idx += length

    if not (0 <= idx < length):
        raise IndexError("string index out of range")

    return idx
1532
1533
@register_jitable
def _normalize_slice_idx_count(arg, slice_len, default):
    """
    Used for unicode_count

    If arg is None, returns default.

    If arg is within the slice, i.e. -slice_len <= arg < slice_len,
    returns its real index via arg % slice_len.

    Otherwise a negative arg is clamped to 0 (no wrap-around) and a
    non-negative arg is returned unchanged (in this case count must
    return 0 if it is the start index).
    """
    if arg is None:
        return default

    within = -slice_len <= arg and arg < slice_len
    if within:
        return arg % slice_len
    if arg < 0:
        return 0
    return arg
1553
1554
@intrinsic
def _normalize_slice(typingctx, sliceobj, length):
    """Fix slice object.

    Typed as ``sliceobj(sliceobj, length)``: the result has the same type
    as the incoming slice. The generated code runs the ``slicing`` helpers
    ``guard_invalid_slice`` and ``fix_slice`` on the slice and returns it.
    """
    sig = sliceobj(sliceobj, length)

    def codegen(context, builder, sig, args):
        [slicetype, lengthtype] = sig.args
        [sliceobj, length] = args
        # struct proxy giving access to the slice's fields
        slice = context.make_helper(builder, slicetype, sliceobj)
        # NOTE(review): presumably rejects invalid slices (e.g. zero step)
        # at run time -- confirm against the slicing module
        slicing.guard_invalid_slice(context, builder, slicetype, slice)
        # adjusts the slice in place with respect to `length`
        slicing.fix_slice(builder, slice, length)
        return slice._getvalue()

    return sig, codegen
1570
1571
@intrinsic
def _slice_span(typingctx, sliceobj):
    """Compute the span from the given slice object.

    Typed as ``intp(sliceobj)``. The generated code returns the value
    produced by ``slicing.get_slice_length`` for the slice.
    """
    sig = types.intp(sliceobj)

    def codegen(context, builder, sig, args):
        [slicetype] = sig.args
        [sliceobj] = args
        # struct proxy giving access to the slice's fields
        slice = context.make_helper(builder, slicetype, sliceobj)
        result_size = slicing.get_slice_length(builder, slice)
        return result_size

    return sig, codegen
1586
1587
@register_jitable(_nrt=False)
def _strncpy(dst, dst_offset, src, src_offset, n):
    """Copy ``n`` characters from ``src`` (starting at ``src_offset``) into
    ``dst`` (starting at ``dst_offset``).

    Strings of the same kind are copied as one block of bytes; strings of
    different kinds are copied one code point at a time.
    """
    if src._kind != dst._kind:
        # character widths differ: transfer code point by code point
        for k in range(n):
            _set_code_point(dst, dst_offset + k,
                            _get_code_point(src, src_offset + k))
    else:
        # identical layout: translate character offsets to byte offsets
        # and copy the region in one go
        char_width = _kind_to_byte_width(src._kind)
        src_start = char_width * src_offset
        dst_start = char_width * dst_offset
        byte_count = n * char_width
        memcpy_region(dst._data, dst_start, src._data,
                      src_start, byte_count, align=1)
1601
1602
@intrinsic
def _get_str_slice_view(typingctx, src_t, start_t, length_t):
    """Create a slice of a unicode string using a view of its data to avoid
    extra allocation.

    Typed as ``unicode_type(unicode_type, intp, intp)``: source string,
    start position in characters, and length in characters. The view shares
    the source's meminfo, kind and ascii flag; its data pointer is offset
    into the source data and its hash is reset to "not computed".
    """
    assert src_t == types.unicode_type

    def codegen(context, builder, sig, args):
        src, start, length = args
        # proxy over the existing string ...
        in_str = cgutils.create_struct_proxy(
            types.unicode_type)(context, builder, value=src)
        # ... and a fresh, empty struct for the view
        view_str = cgutils.create_struct_proxy(
            types.unicode_type)(context, builder)
        # the view shares the source's memory management info and flags
        view_str.meminfo = in_str.meminfo
        view_str.kind = in_str.kind
        view_str.is_ascii = in_str.is_ascii
        view_str.length = length
        # hash value -1 to indicate "need to compute hash"
        view_str.hash = context.get_constant(_Py_hash_t, -1)
        # get a pointer to start of slice data
        # (resolve and call _kind_to_byte_width to turn the character
        # offset into a byte offset)
        bw_typ = context.typing_context.resolve_value_type(_kind_to_byte_width)
        bw_sig = bw_typ.get_call_type(
            context.typing_context, (types.int32,), {})
        bw_impl = context.get_function(bw_typ, bw_sig)
        byte_width = bw_impl(builder, (in_str.kind,))
        offset = builder.mul(start, byte_width)
        view_str.data = builder.gep(in_str.data, [offset])
        # Set parent pyobject to NULL
        view_str.parent = cgutils.get_null_value(view_str.parent.type)
        # incref original string
        # (the view refers to the source's meminfo, so it takes a reference)
        if context.enable_nrt:
            context.nrt.incref(builder, sig.args[0], src)
        return view_str._getvalue()

    sig = types.unicode_type(types.unicode_type, types.intp, types.intp)
    return sig, codegen
1639
1640
@overload(operator.getitem)
def unicode_getitem(s, idx):
    """Typing function for ``s[idx]`` on unicode strings.

    Integer indices yield a new one-character string; slices yield a view
    when the step is 1 and a freshly built string otherwise. Returns
    ``None`` (no implementation) for other operand types.
    """
    if not isinstance(s, types.UnicodeType):
        return None

    if isinstance(idx, types.Integer):
        def getitem_char(s, idx):
            pos = normalize_str_idx(idx, len(s))
            char = _empty_string(s._kind, 1, s._is_ascii)
            _set_code_point(char, 0, _get_code_point(s, pos))
            return char
        return getitem_char

    if isinstance(idx, types.SliceType):
        def getitem_slice(s, idx):
            fixed = _normalize_slice(idx, len(s))
            n_out = _slice_span(fixed)

            if fixed.step == 1:
                # contiguous range: share the data instead of copying
                return _get_str_slice_view(s, fixed.start, n_out)

            # strided range: gather the selected code points
            out = _empty_string(s._kind, n_out, s._is_ascii)
            src_pos = fixed.start
            for k in range(n_out):
                _set_code_point(out, k, _get_code_point(s, src_pos))
                src_pos += fixed.step
            return out
        return getitem_slice

    return None
1666
1667
1668# ------------------------------------------------------------------------------
1669# String operations
1670# ------------------------------------------------------------------------------
1671
1672
@overload(operator.add)
@overload(operator.iadd)
def unicode_concat(a, b):
    """Typing function for ``a + b`` / ``a += b`` with a unicode left side.

    The right side may be a unicode string or a ``UnicodeCharSeq`` (which
    is converted with ``str()`` and re-dispatched). Returns ``None`` (no
    implementation) for other operand types.
    """
    if not isinstance(a, types.UnicodeType):
        return None

    if isinstance(b, types.UnicodeType):
        def concat_impl(a, b):
            a_len = len(a)
            b_len = len(b)
            # the result must be able to hold the characters of both sides
            out_kind = _pick_kind(a._kind, b._kind)
            out_ascii = _pick_ascii(a._is_ascii, b._is_ascii)
            out = _empty_string(out_kind, a._length + b._length, out_ascii)
            for i in range(a_len):
                _set_code_point(out, i, _get_code_point(a, i))
            for j in range(b_len):
                _set_code_point(out, a_len + j, _get_code_point(b, j))
            return out
        return concat_impl

    if isinstance(b, types.UnicodeCharSeq):
        def concat_impl(a, b):
            return a + str(b)
        return concat_impl

    return None
1693
1694
@register_jitable
def _repeat_impl(str_arg, mult_arg):
    """Return ``str_arg`` repeated ``mult_arg`` times.

    An empty string or a multiplier below 1 gives ``''``; a multiplier of 1
    returns ``str_arg`` itself. Otherwise the result is allocated once and
    filled by repeatedly doubling the already written prefix.
    """
    if str_arg == '' or mult_arg < 1:
        return ''
    elif mult_arg == 1:
        return str_arg
    else:
        new_length = str_arg._length * mult_arg
        new_kind = str_arg._kind
        result = _empty_string(new_kind, new_length, str_arg._is_ascii)
        # make initial copy into result
        len_a = len(str_arg)
        _strncpy(result, 0, str_arg, 0, len_a)
        # loop through powers of 2 for efficient copying
        copy_size = len_a
        while 2 * copy_size <= new_length:
            _strncpy(result, copy_size, result, 0, copy_size)
            copy_size *= 2

        # After the loop copy_size <= new_length < 2 * copy_size. Both are
        # multiples of len_a, so the missing tail is a whole number of
        # repetitions and can be taken from the end of the filled prefix.
        rest = new_length - copy_size
        if rest > 0:
            _strncpy(result, copy_size, result, copy_size - rest, rest)
        # every path through this branch returns the built string
        return result
1720
1721
@overload(operator.mul)
def unicode_repeat(a, b):
    """Typing function for ``str * int`` and ``int * str``.

    Returns ``None`` (no implementation) for other operand types.
    """
    str_times_int = (isinstance(a, types.UnicodeType) and
                     isinstance(b, types.Integer))
    if str_times_int:
        def wrap(a, b):
            return _repeat_impl(a, b)
        return wrap

    int_times_str = (isinstance(a, types.Integer) and
                     isinstance(b, types.UnicodeType))
    if int_times_str:
        def wrap(a, b):
            # the helper expects the string first
            return _repeat_impl(b, a)
        return wrap

    return None
1732
1733
@overload(operator.not_)
def unicode_not(a):
    """Typing function for ``not a`` on unicode strings.

    The implementation is true exactly for the empty string. Returns
    ``None`` (no implementation) for other operand types.
    """
    if not isinstance(a, types.UnicodeType):
        return None

    def impl(a):
        return len(a) == 0

    return impl
1740
1741
@overload_method(types.UnicodeType, 'replace')
def unicode_replace(s, old_str, new_str, count=-1):
    """Typing function for ``str.replace(old_str, new_str, count)``.

    ``count`` must resolve to an integer, ``old_str`` to a unicode string
    (or the none type) and ``new_str`` to a unicode string; otherwise a
    ``TypingError`` is raised. As in CPython, any negative ``count`` means
    "replace every occurrence" and ``count == 0`` returns ``s`` unchanged.
    """
    thety = count
    # if the type is omitted, the concrete type is the value
    if isinstance(count, types.Omitted):
        thety = count.value
    # if the type is optional, the concrete type is the captured type
    elif isinstance(count, types.Optional):
        thety = count.type

    if not isinstance(thety, (int, types.Integer)):
        raise TypingError('Unsupported parameters. The parametrs '
                          'must be Integer. Given count: {}'.format(count))

    if not isinstance(old_str, (types.UnicodeType, types.NoneType)):
        raise TypingError('The object must be a UnicodeType.'
                          ' Given: {}'.format(old_str))

    if not isinstance(new_str, types.UnicodeType):
        raise TypingError('The object must be a UnicodeType.'
                          ' Given: {}'.format(new_str))

    def impl(s, old_str, new_str, count=-1):
        if count == 0:
            return s
        if old_str == '':
            # An empty pattern matches before every character and at the
            # end of the string.
            schars = list(s)
            if len(schars) == 0:
                # the empty string has exactly one (empty) match
                return new_str
            if count < 0:
                return new_str + new_str.join(schars) + new_str
            split_result = [new_str]
            min_count = min(len(schars), count)
            for i in range(min_count):
                split_result.append(schars[i])
                if i + 1 != min_count:
                    split_result.append(new_str)
                else:
                    # limit reached: keep the rest of the string verbatim
                    split_result.append(''.join(schars[(i + 1):]))
            if count > len(schars):
                # enough budget left for the match at the very end
                split_result.append(new_str)
            return ''.join(split_result)
        # Any negative count means "replace all"; pass the canonical -1 so
        # the result does not depend on how split treats other negatives.
        limit = count
        if count < 0:
            limit = -1
        schars = s.split(old_str, limit)
        result = new_str.join(schars)
        return result

    return impl
1785
1786# ------------------------------------------------------------------------------
1787# String `is*()` methods
1788# ------------------------------------------------------------------------------
1789
1790
1791# generates isalpha/isalnum
def gen_isAlX(ascii_func, unicode_func):
    """Build the overload body shared by ``isalpha`` and ``isalnum``.

    ``ascii_func`` and ``unicode_func`` are per-code-point predicates. The
    generated implementation returns False for the empty string and
    otherwise True only when every code point satisfies the predicate:
    ``ascii_func`` for strings flagged as ASCII, ``unicode_func`` for all
    other strings.

    The two predicates are expected to agree on ASCII code points (the
    single-character path already relies on ``ascii_func`` alone for ASCII
    data), so an ASCII string is scanned only once.
    """
    def unicode_isAlX(data):

        def impl(data):
            length = len(data)
            if length == 0:
                return False

            if length == 1:
                code_point = _get_code_point(data, 0)
                if data._is_ascii:
                    return ascii_func(code_point)
                else:
                    return unicode_func(code_point)

            if data._is_ascii:
                for i in range(length):
                    code_point = _get_code_point(data, i)
                    if not ascii_func(code_point):
                        return False
                # every character passed the ASCII predicate; a second pass
                # with the unicode predicate would only repeat the work
                return True

            for i in range(length):
                code_point = _get_code_point(data, i)
                if not unicode_func(code_point):
                    return False

            return True

        return impl
    return unicode_isAlX
1822
1823
1824# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11928-L11964    # noqa: E501
# str.isalpha(): gen_isAlX receives _Py_ISALPHA as the predicate for ASCII
# strings and _PyUnicode_IsAlpha as the predicate for all other strings.
overload_method(types.UnicodeType, 'isalpha')(gen_isAlX(_Py_ISALPHA,
                                                        _PyUnicode_IsAlpha))

# Unicode predicate used by isalnum: a code point qualifies when it is
# numeric or alphabetic.
_unicode_is_alnum = register_jitable(lambda x:
                                     (_PyUnicode_IsNumeric(x) or
                                      _PyUnicode_IsAlpha(x)))

# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006    # noqa: E501
# str.isalnum(): same generator, with _Py_ISALNUM for ASCII strings and
# _unicode_is_alnum for all other strings.
overload_method(types.UnicodeType, 'isalnum')(gen_isAlX(_Py_ISALNUM,
                                                        _unicode_is_alnum))
1835
1836
1837def _is_upper(is_lower, is_upper, is_title):
1838    # impl is an approximate translation of:
1839    # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11794-L11827    # noqa: E501
1840    # mixed with:
1841    # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L218-L242    # noqa: E501
1842    def impl(a):
1843        l = len(a)
1844        if l == 1:
1845            return is_upper(_get_code_point(a, 0))
1846        if l == 0:
1847            return False
1848        cased = False
1849        for idx in range(l):
1850            code_point = _get_code_point(a, idx)
1851            if is_lower(code_point) or is_title(code_point):
1852                return False
1853            elif(not cased and is_upper(code_point)):
1854                cased = True
1855        return cased
1856    return impl
1857
1858
# Predicate that answers False for every input; passed as the title-case
# test when building the ASCII isupper kernel.
_always_false = register_jitable(lambda x: False)
# isupper kernel built from the _Py_IS* predicates (used by unicode_isupper
# for strings flagged as ASCII).
_ascii_is_upper = register_jitable(_is_upper(_Py_ISLOWER, _Py_ISUPPER,
                                             _always_false))
# isupper kernel built from the _PyUnicode_Is* predicates (used by
# unicode_isupper for all other strings).
_unicode_is_upper = register_jitable(_is_upper(_PyUnicode_IsLowercase,
                                               _PyUnicode_IsUppercase,
                                               _PyUnicode_IsTitlecase))
1865
1866
@overload_method(types.UnicodeType, 'isupper')
def unicode_isupper(a):
    """
    Implements .isupper()

    Dispatches on the string's ASCII flag to the matching kernel.
    """
    def isupper_impl(a):
        if not a._is_ascii:
            return _unicode_is_upper(a)
        return _ascii_is_upper(a)
    return isupper_impl
1878
1879
if utils.PYVERSION >= (3, 7):
    @overload_method(types.UnicodeType, 'isascii')
    def unicode_isascii(data):
        """Implements UnicodeType.isascii() (registered on Python 3.7+)."""

        def isascii_impl(data):
            # the flag carried by the string already holds the answer
            return data._is_ascii
        return isascii_impl
1888
1889
@overload_method(types.UnicodeType, 'istitle')
def unicode_istitle(data):
    """
    Implements UnicodeType.istitle()

    A string is title-cased when it holds at least one cased character,
    upper/title-case characters only follow uncased ones and lower-case
    characters only follow cased ones.
    The algorithm is an approximate translation from CPython:
    https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501
    """

    def impl(data):
        length = len(data)
        if length == 0:
            return False

        if length == 1:
            # single character: upper or title case is enough
            cp = _get_code_point(data, 0)
            return _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp)

        seen_cased = False
        after_cased = False
        for pos in range(length):
            cp = _get_code_point(data, pos)
            if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp):
                # an upper/title-case character may only start a word
                if after_cased:
                    return False
                after_cased = True
                seen_cased = True
            elif _PyUnicode_IsLowercase(cp):
                # a lower-case character may only continue a word
                if not after_cased:
                    return False
                after_cased = True
                seen_cased = True
            else:
                after_cased = False

        return seen_cased
    return impl
1926
1927
@overload_method(types.UnicodeType, 'islower')
def unicode_islower(data):
    """
    Implements UnicodeType.islower()

    impl is an approximate translation of:
    https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933    # noqa: E501
    mixed with:
    https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156    # noqa: E501
    """

    def impl(data):
        n_chars = len(data)
        if n_chars == 0:
            return False
        if n_chars == 1:
            return _PyUnicode_IsLowercase(_get_code_point(data, 0))

        found_lower = False
        for pos in range(n_chars):
            code_point = _get_code_point(data, pos)
            # any upper or title-case character disqualifies the string
            if (_PyUnicode_IsUppercase(code_point)
                    or _PyUnicode_IsTitlecase(code_point)):
                return False
            if not found_lower and _PyUnicode_IsLowercase(code_point):
                found_lower = True
        return found_lower
    return impl
1953
1954
1955# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12126-L12161    # noqa: E501
@overload_method(types.UnicodeType, 'isidentifier')
def unicode_isidentifier(data):
    """Implements UnicodeType.isidentifier()

    The first code point must be an XID start character or an underscore
    (0x5F); every following one must be an XID continue character. The
    empty string is not an identifier.
    """

    def impl(data):
        n_chars = len(data)
        if n_chars == 0:
            return False

        head = _get_code_point(data, 0)
        if not (_PyUnicode_IsXidStart(head) or head == 0x5F):
            return False

        for pos in range(1, n_chars):
            if not _PyUnicode_IsXidContinue(_get_code_point(data, pos)):
                return False

        return True

    return impl
1977
1978
1979# generator for simple unicode "isX" methods
def gen_isX(_PyUnicode_IS_func, empty_is_false=True):
    """Build an overload body for a simple ``str.isX()`` method.

    The generated implementation returns True when every code point of the
    string satisfies ``_PyUnicode_IS_func``. For the empty string it
    returns False when ``empty_is_false`` is set and True otherwise.
    """
    def unicode_isX(data):
        def impl(data):
            n_chars = len(data)
            if n_chars == 0 and empty_is_false:
                return False

            if n_chars == 1:
                return _PyUnicode_IS_func(_get_code_point(data, 0))

            for pos in range(n_chars):
                if not _PyUnicode_IS_func(_get_code_point(data, pos)):
                    return False

            return True

        return impl
    return unicode_isX
1999
2000
2001# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11896-L11925    # noqa: E501
# Each registration below builds its method with gen_isX, which applies the
# given per-code-point predicate to every character of the string.
overload_method(types.UnicodeType, 'isspace')(gen_isX(_PyUnicode_IsSpace))

# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12096-L12124    # noqa: E501
overload_method(types.UnicodeType, 'isnumeric')(gen_isX(_PyUnicode_IsNumeric))

# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12056-L12085    # noqa: E501
overload_method(types.UnicodeType, 'isdigit')(gen_isX(_PyUnicode_IsDigit))

# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045    # noqa: E501
overload_method(types.UnicodeType, 'isdecimal')(
    gen_isX(_PyUnicode_IsDecimalDigit))

# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213    # noqa: E501
# isprintable passes empty_is_false=False, so the empty string counts as
# printable.
overload_method(types.UnicodeType, 'isprintable')(
    gen_isX(_PyUnicode_IsPrintable, False))
2017
2018# ------------------------------------------------------------------------------
2019# String methods that apply a transformation to the characters themselves
2020# ------------------------------------------------------------------------------
2021
2022
2023# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908    # noqa: E501
def case_operation(ascii_func, unicode_func):
    """Build the implementation shared by the case-changing str methods.

    ``ascii_func(data, res)`` fills a result of the same length for strings
    flagged as ASCII. ``unicode_func(data, length, tmp, maxchars)`` writes
    the converted code points into ``tmp``, records the largest one in
    ``maxchars[0]`` and returns how many code points it wrote.
    """
    def impl(data):
        n_chars = len(data)
        if n_chars == 0:
            return _empty_string(data._kind, n_chars, data._is_ascii)

        if data._is_ascii:
            # ASCII path: the result keeps the kind and length of the input
            ascii_out = _empty_string(data._kind, n_chars, 1)
            ascii_func(data, ascii_out)
            return ascii_out

        # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908    # noqa: E501
        # 4-byte scratch string with room for three code points per input
        # character
        scratch = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * n_chars,
                                data._is_ascii)
        # a one-element list lets the callee hand the maximum code point
        # back to us by reference
        maxchars = [0]
        out_len = unicode_func(data, n_chars, scratch, maxchars)
        top_char = maxchars[0]
        # build the final string with the narrowest kind that can hold
        # the largest code point, then copy the scratch contents over
        out = _empty_string(_codepoint_to_kind(top_char), out_len,
                            _codepoint_is_ascii(top_char))
        for pos in range(out_len):
            _set_code_point(out, pos, _get_code_point(scratch, pos))

        return out

    return impl
2050
2051
2052# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9856-L9883    # noqa: E501
@register_jitable
def _handle_capital_sigma(data, length, idx):
    """Choose the lower-case form for the capital sigma at ``idx``.

    Returns 0x3c2 (final sigma) when the nearest non-case-ignorable code
    point before ``idx`` is cased and the nearest one after ``idx`` is
    either missing or not cased; returns 0x3c3 otherwise.
    """
    cp = 0
    before = idx - 1
    # walk left over case-ignorable code points
    while before >= 0:
        cp = _get_code_point(data, before)
        if not _PyUnicode_IsCaseIgnorable(cp):
            break
        before -= 1
    if before < 0 or not _PyUnicode_IsCased(cp):
        # nothing cased precedes the sigma: it cannot be a final sigma
        return 0x3c3

    after = idx + 1
    # walk right over case-ignorable code points
    while after < length:
        cp = _get_code_point(data, after)
        if not _PyUnicode_IsCaseIgnorable(cp):
            break
        after += 1
    if after == length or not _PyUnicode_IsCased(cp):
        return 0x3c2
    return 0x3c3
2074
2075
2076# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9885-L9895    # noqa: E501
@register_jitable
def _lower_ucs4(code_point, data, length, idx, mapped):
    """Lower-case one code point into ``mapped`` and return the count.

    The capital sigma (0x3A3) is resolved by ``_handle_capital_sigma`` and
    always yields one code point; everything else is delegated to
    ``_PyUnicode_ToLowerFull``.
    """
    if code_point != 0x3A3:
        return _PyUnicode_ToLowerFull(code_point, mapped)
    mapped[0] = _handle_capital_sigma(data, length, idx)
    return 1
2084
2085
2086# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965    # noqa: E501
def _gen_unicode_upper_or_lower(lower):
    """Build the unicode kernel for ``str.lower`` or ``str.upper``.

    ``lower`` selects the mapping: ``_lower_ucs4`` when true,
    ``_PyUnicode_ToUpperFull`` otherwise. The returned function converts
    ``length`` code points of ``data`` into ``res``, keeps the largest
    written code point in ``maxchars[0]`` and returns the number of code
    points written.
    """
    def _do_upper_or_lower(data, length, res, maxchars):
        k = 0
        # one scratch buffer for the whole string, cleared per character
        # (same scheme as _unicode_casefold) instead of a fresh allocation
        # on every iteration
        mapped = np.zeros(3, dtype=_Py_UCS4)
        for idx in range(length):
            mapped.fill(0)
            code_point = _get_code_point(data, idx)
            if lower:
                n_res = _lower_ucs4(code_point, data, length, idx, mapped)
            else:
                n_res = _PyUnicode_ToUpperFull(code_point, mapped)
            # only the first n_res entries of the buffer are meaningful
            for m in mapped[:n_res]:
                maxchars[0] = max(maxchars[0], m)
                _set_code_point(res, k, m)
                k += 1
        return k
    return _do_upper_or_lower
2104
2105
# Jitted unicode kernels handed to case_operation by str.upper / str.lower:
# lower=False selects the upper-case mapping, lower=True the lower-case one.
_unicode_upper = register_jitable(_gen_unicode_upper_or_lower(False))
_unicode_lower = register_jitable(_gen_unicode_upper_or_lower(True))
2108
2109
2110def _gen_ascii_upper_or_lower(func):
2111    def _ascii_upper_or_lower(data, res):
2112        for idx in range(len(data)):
2113            code_point = _get_code_point(data, idx)
2114            _set_code_point(res, idx, func(code_point))
2115    return _ascii_upper_or_lower
2116
2117
# Jitted ASCII kernels handed to case_operation by str.upper / str.lower,
# built from the per-character _Py_TOUPPER / _Py_TOLOWER mappings.
_ascii_upper = register_jitable(_gen_ascii_upper_or_lower(_Py_TOUPPER))
_ascii_lower = register_jitable(_gen_ascii_upper_or_lower(_Py_TOLOWER))
2120
2121
@overload_method(types.UnicodeType, 'lower')
def unicode_lower(data):
    """Implements .lower() via the shared case_operation machinery."""
    lower_impl = case_operation(_ascii_lower, _unicode_lower)
    return lower_impl
2126
2127
@overload_method(types.UnicodeType, 'upper')
def unicode_upper(data):
    """Implements .upper() via the shared case_operation machinery."""
    upper_impl = case_operation(_ascii_upper, _unicode_upper)
    return upper_impl
2132
2133
2134# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834    # noqa: E501
@register_jitable
def _unicode_casefold(data, length, res, maxchars):
    """Case-fold ``length`` code points of ``data`` into ``res``.

    Each code point is expanded with ``_PyUnicode_ToFoldedFull``. The
    largest code point written is kept in ``maxchars[0]`` and the number of
    code points written is returned.
    """
    written = 0
    folded = np.zeros(3, dtype=_Py_UCS4)
    for pos in range(length):
        folded.fill(0)
        n_folded = _PyUnicode_ToFoldedFull(_get_code_point(data, pos),
                                           folded)
        for m in folded[:n_folded]:
            maxchars[0] = max(maxchars[0], m)
            _set_code_point(res, written, m)
            written += 1

    return written
2150
2151
@register_jitable
def _ascii_casefold(data, res):
    """Case-fold an ASCII string: every character goes through _Py_TOLOWER."""
    n_chars = len(data)
    for pos in range(n_chars):
        _set_code_point(res, pos, _Py_TOLOWER(_get_code_point(data, pos)))
2157
2158
@overload_method(types.UnicodeType, 'casefold')
def unicode_casefold(data):
    """Implements str.casefold() via the shared case_operation machinery."""
    casefold_impl = case_operation(_ascii_casefold, _unicode_casefold)
    return casefold_impl
2163
2164
2165# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759    # noqa: E501
@register_jitable
def _unicode_capitalize(data, length, res, maxchars):
    """Write the capitalized form of ``data`` into ``res``.

    The first code point goes through the title-case mapping when
    ``_py38_or_later`` is set and through the upper-case mapping otherwise;
    every following code point is lower-cased with ``_lower_ucs4``. The
    largest code point written is stored in ``maxchars[0]`` and the number
    of code points written is returned.
    """
    written = 0
    largest = 0
    mapped = np.zeros(3, dtype=_Py_UCS4)

    # leading character
    # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c    # noqa: E501
    head = _get_code_point(data, 0)
    if _py38_or_later:
        n_head = _PyUnicode_ToTitleFull(head, mapped)
    else:
        n_head = _PyUnicode_ToUpperFull(head, mapped)
    for m in mapped[:n_head]:
        largest = max(largest, m)
        _set_code_point(res, written, m)
        written += 1

    # remaining characters
    for idx in range(1, length):
        mapped.fill(0)
        tail = _get_code_point(data, idx)
        n_tail = _lower_ucs4(tail, data, length, idx, mapped)
        for m in mapped[:n_tail]:
            largest = max(largest, m)
            _set_code_point(res, written, m)
            written += 1

    maxchars[0] = largest
    return written
2193
2194
2195# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382    # noqa: E501
@register_jitable
def _ascii_capitalize(data, res):
    """Capitalize an ASCII string into ``res``.

    The first character is upper-cased with _Py_TOUPPER, all following
    characters are lower-cased with _Py_TOLOWER.
    """
    _set_code_point(res, 0, _Py_TOUPPER(_get_code_point(data, 0)))
    n_chars = len(data)
    for pos in range(1, n_chars):
        _set_code_point(res, pos, _Py_TOLOWER(_get_code_point(data, pos)))
2203
2204
2205# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774    # noqa: E501
@overload_method(types.UnicodeType, 'capitalize')
def unicode_capitalize(data):
    """Implements str.capitalize() via the shared case_operation machinery."""
    capitalize_impl = case_operation(_ascii_capitalize, _unicode_capitalize)
    return capitalize_impl
2209
2210
2211# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9996-L10021    # noqa: E501
@register_jitable
def _unicode_title(data, length, res, maxchars):
    """Title-case ``length`` code points of ``data`` into ``res``.

    A code point that follows a cased one is lower-cased with
    ``_lower_ucs4``; any other code point goes through the title-case
    mapping. ``maxchars[0]`` tracks the largest code point written and the
    number of code points written is returned.
    """
    written = 0
    after_cased = False
    mapped = np.empty(3, dtype=_Py_UCS4)
    for idx in range(length):
        mapped.fill(0)
        code_point = _get_code_point(data, idx)
        if not after_cased:
            n_mapped = _PyUnicode_ToTitleFull(_Py_UCS4(code_point), mapped)
        else:
            n_mapped = _lower_ucs4(code_point, data, length, idx, mapped)
        for m in mapped[:n_mapped]:
            (current_max,) = maxchars
            maxchars[0] = max(current_max, m)
            _set_code_point(res, written, m)
            written += 1
        # the casedness of the *input* character drives the next step
        after_cased = _PyUnicode_IsCased(_Py_UCS4(code_point))
    return written
2232
2233
2234# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L332-L352    # noqa: E501
@register_jitable
def _ascii_title(data, res):
    """Does .title() on an ASCII string, writing the result into ``res``.

    A lower-case letter that does not follow a cased character is
    upper-cased, an upper-case letter that follows a cased character is
    lower-cased, and everything else is copied unchanged.
    """
    in_word = False
    n_chars = len(data)
    for pos in range(n_chars):
        out = _get_code_point(data, pos)
        if _Py_ISLOWER(out):
            if not in_word:
                out = _Py_TOUPPER(out)
            in_word = True
        elif _Py_ISUPPER(out):
            if in_word:
                out = _Py_TOLOWER(out)
            in_word = True
        else:
            # an uncased character ends the current word
            in_word = False
        _set_code_point(res, pos, out)
2252
2253
2254# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069    # noqa: E501
@overload_method(types.UnicodeType, 'title')
def unicode_title(data):
    """Implements str.title() via the shared case_operation machinery."""
    # https://docs.python.org/3/library/stdtypes.html#str.title
    title_impl = case_operation(_ascii_title, _unicode_title)
    return title_impl
2260
2261
2262# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L391-L408    # noqa: E501
@register_jitable
def _ascii_swapcase(data, res):
    """Swap the case of every ASCII letter of ``data`` into ``res``.

    Characters that are neither upper nor lower case are copied unchanged.
    """
    n_chars = len(data)
    for pos in range(n_chars):
        out = _get_code_point(data, pos)
        if _Py_ISUPPER(out):
            out = _Py_TOLOWER(out)
        elif _Py_ISLOWER(out):
            out = _Py_TOUPPER(out)
        _set_code_point(res, pos, out)
2272
2273
2274# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784    # noqa: E501
@register_jitable
def _unicode_swapcase(data, length, res, maxchars):
    """Swap the case of ``length`` code points of ``data`` into ``res``.

    Upper-case code points are lower-cased with ``_lower_ucs4``,
    lower-case ones are upper-cased with ``_PyUnicode_ToUpperFull`` and
    all others are copied unchanged. The largest code point written is
    stored in ``maxchars[0]``; the number of code points written is
    returned.
    """
    written = 0
    largest = 0
    mapped = np.empty(3, dtype=_Py_UCS4)
    for idx in range(length):
        mapped.fill(0)
        code_point = _get_code_point(data, idx)
        if _PyUnicode_IsUppercase(code_point):
            n_mapped = _lower_ucs4(code_point, data, length, idx, mapped)
        elif _PyUnicode_IsLowercase(code_point):
            n_mapped = _PyUnicode_ToUpperFull(code_point, mapped)
        else:
            # uncased: pass the code point through as-is
            mapped[0] = code_point
            n_mapped = 1
        for m in mapped[:n_mapped]:
            largest = max(largest, m)
            _set_code_point(res, written, m)
            written += 1
    maxchars[0] = largest
    return written
2296
2297
@overload_method(types.UnicodeType, 'swapcase')
def unicode_swapcase(data):
    """Implements str.swapcase() via the shared case_operation machinery."""
    swapcase_impl = case_operation(_ascii_swapcase, _unicode_swapcase)
    return swapcase_impl
2301
2302
2303# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/bltinmodule.c#L1781-L1824    # noqa: E501
@overload(ord)
def ol_ord(c):
    """Overload ``ord`` for unicode strings.

    The implementation returns the code point of a length-one string and
    raises TypeError for any other length.
    """
    if not isinstance(c, types.UnicodeType):
        return None

    def ord_impl(c):
        if len(c) != 1:
            # CPython does TypeError
            raise TypeError("ord() expected a character")
        return _get_code_point(c, 0)
    return ord_impl
2314
2315
2316# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L2005-L2028    # noqa: E501
2317# This looks a bit different to the cpython implementation but, with the
2318# exception of a latin1 fast path is logically the same. It finds the "kind" of
2319# the codepoint `ch`, creates a length 1 string of that kind and then injects
2320# the code point into the zero position of that string. Cpython does similar but
2321# branches for each kind (this is encapsulated in Numba's _set_code_point).
@register_jitable
def _unicode_char(ch):
    """Return a length-one string holding the code point ``ch``.

    The string is created with the kind reported by ``_codepoint_to_kind``
    and its ASCII flag is set from ``_codepoint_is_ascii``. ``ch`` must not
    exceed ``_MAX_UNICODE``.
    """
    assert ch <= _MAX_UNICODE
    kind = _codepoint_to_kind(ch)
    # The 1-byte kind also holds the non-ASCII Latin-1 code points
    # (128-255), so the kind alone must not decide the ASCII flag; use the
    # same test case_operation applies to its result.
    ret = _empty_string(kind, 1, _codepoint_is_ascii(ch))
    _set_code_point(ret, 0, ch)
    return ret
2329
2330
# Text of the ValueError raised by _PyUnicode_FromOrdinal for an ordinal
# outside the valid range; formatted once here with _MAX_UNICODE in hex.
_out_of_range_msg = "chr() arg not in range(0x%hx)" % _MAX_UNICODE
2332
2333
2334# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L3045-L3055    # noqa: E501
@register_jitable
def _PyUnicode_FromOrdinal(ordinal):
    """Return the one-character string for ``ordinal``.

    Raises ValueError when ``ordinal`` is negative or greater than
    ``_MAX_UNICODE``.
    """
    if ordinal >= 0 and ordinal <= _MAX_UNICODE:
        return _unicode_char(_Py_UCS4(ordinal))

    raise ValueError(_out_of_range_msg)
2341
2342
2343# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/bltinmodule.c#L715-L720    # noqa: E501
@overload(chr)
def ol_chr(i):
    """Overload ``chr`` for integers by way of _PyUnicode_FromOrdinal."""
    if not isinstance(i, types.Integer):
        return None

    def chr_impl(i):
        return _PyUnicode_FromOrdinal(i)
    return chr_impl
2350
2351
@overload(str)
def integer_str(n):
    """Overload ``str`` for integers.

    The implementation peels off decimal digits from the least significant
    end, appends a minus sign for negative input and reverses the collected
    characters to form the result.
    """
    if not isinstance(n, types.Integer):
        return None

    # the base, created through the integer type that was passed in
    ten = n(10)

    def str_impl(n):
        negative = False
        if n < 0:
            n = -n
            negative = True
        if n == 0:
            return '0'
        digits = []
        while n > 0:
            digits.append(chr(ord('0') + (n % ten)))
            n = n // ten
        if negative:
            digits.append('-')
        # digits were collected least significant first
        return ''.join(digits[::-1])
    return str_impl
2373
2374# ------------------------------------------------------------------------------
2375# iteration
2376# ------------------------------------------------------------------------------
2377
2378
@lower_builtin('getiter', types.UnicodeType)
def getiter_unicode(context, builder, sig, args):
    """Lower ``iter(str)``: build the iterator struct for a unicode string.

    The iterator holds a pointer to its current index (initialised to
    zero) and the string being iterated. The string is incref'ed when NRT
    is enabled and the struct is returned as a new reference.
    """
    (str_ty,) = sig.args
    (str_val,) = args

    it = context.make_helper(builder, sig.return_type)

    # the index starts at zero and lives behind the pointer produced by
    # alloca_once_value
    start = context.get_constant(types.uintp, 0)
    it.index = cgutils.alloca_once_value(builder, start)

    # wire in the unicode type data
    it.data = str_val

    # the iterator keeps the string, so take a reference when NRT is on
    if context.enable_nrt:
        context.nrt.incref(builder, str_ty, str_val)

    it_val = it._getvalue()
    return impl_ret_new_ref(context, builder, sig.return_type, it_val)
2401
2402
@lower_builtin('iternext', types.UnicodeIteratorType)
# a new ref counted object is put into result._yield, hence RefType.NEW
@iternext_impl(RefType.NEW)
def iternext_unicode(context, builder, sig, args, result):
    """Lower ``next()`` for the unicode iterator.

    Marks ``result`` valid while the stored index is below the string
    length; in that case the item at the index (obtained through the
    string's ``getitem`` implementation) is yielded and the stored index is
    incremented.
    """
    [iterty] = sig.args
    [iter] = args

    tyctx = context.typing_context

    # get ref to unicode.__getitem__
    fnty = tyctx.resolve_value_type(operator.getitem)
    getitem_sig = fnty.get_call_type(tyctx, (types.unicode_type, types.uintp),
                                     {})
    getitem_impl = context.get_function(fnty, getitem_sig)

    # get ref to unicode.__len__
    fnty = tyctx.resolve_value_type(len)
    len_sig = fnty.get_call_type(tyctx, (types.unicode_type,), {})
    len_impl = context.get_function(fnty, len_sig)

    # grab unicode iterator struct
    iterobj = context.make_helper(builder, iterty, value=iter)

    # find the length of the string
    strlen = len_impl(builder, (iterobj.data,))

    # find the current index (iterobj.index is a pointer, so load through it)
    index = builder.load(iterobj.index)

    # see if the index is in range
    is_valid = builder.icmp_unsigned('<', index, strlen)
    result.set_valid(is_valid)

    # everything below is emitted only on the "index in range" path
    with builder.if_then(is_valid):
        # return value at index
        gotitem = getitem_impl(builder, (iterobj.data, index,))
        result.yield_(gotitem)

        # bump index for next cycle
        nindex = cgutils.increment_index(builder, index)
        builder.store(nindex, iterobj.index)
2444