1import sys 2import operator 3 4import numpy as np 5from llvmlite.ir import IntType, Constant 6 7from numba.core.extending import ( 8 models, 9 register_model, 10 make_attribute_wrapper, 11 unbox, 12 box, 13 NativeValue, 14 overload, 15 overload_method, 16 intrinsic, 17 register_jitable, 18) 19from numba.core.imputils import (lower_constant, lower_cast, lower_builtin, 20 iternext_impl, impl_ret_new_ref, RefType) 21from numba.core.datamodel import register_default, StructModel 22from numba.core import utils, types, cgutils 23from numba.core.pythonapi import ( 24 PY_UNICODE_1BYTE_KIND, 25 PY_UNICODE_2BYTE_KIND, 26 PY_UNICODE_4BYTE_KIND, 27 PY_UNICODE_WCHAR_KIND, 28) 29from numba._helperlib import c_helpers 30from numba.cpython.hashing import _Py_hash_t 31from numba.core.unsafe.bytes import memcpy_region 32from numba.core.errors import TypingError 33from numba.cpython.unicode_support import (_Py_TOUPPER, _Py_TOLOWER, _Py_UCS4, 34 _Py_ISALNUM, 35 _PyUnicode_ToUpperFull, 36 _PyUnicode_ToLowerFull, 37 _PyUnicode_ToFoldedFull, 38 _PyUnicode_ToTitleFull, 39 _PyUnicode_IsPrintable, 40 _PyUnicode_IsSpace, 41 _Py_ISSPACE, 42 _PyUnicode_IsXidStart, 43 _PyUnicode_IsXidContinue, 44 _PyUnicode_IsCased, 45 _PyUnicode_IsCaseIgnorable, 46 _PyUnicode_IsUppercase, 47 _PyUnicode_IsLowercase, 48 _PyUnicode_IsLineBreak, 49 _Py_ISLINEBREAK, 50 _Py_ISLINEFEED, 51 _Py_ISCARRIAGERETURN, 52 _PyUnicode_IsTitlecase, 53 _Py_ISLOWER, 54 _Py_ISUPPER, 55 _Py_TAB, 56 _Py_LINEFEED, 57 _Py_CARRIAGE_RETURN, 58 _Py_SPACE, 59 _PyUnicode_IsAlpha, 60 _PyUnicode_IsNumeric, 61 _Py_ISALPHA, 62 _PyUnicode_IsDigit, 63 _PyUnicode_IsDecimalDigit) 64from numba.cpython import slicing 65 66 67_py38_or_later = utils.PYVERSION >= (3, 8) 68 69# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L84-L85 # noqa: E501 70_MAX_UNICODE = 0x10ffff 71 72# DATA MODEL 73 74 75@register_model(types.UnicodeType) 76class UnicodeModel(models.StructModel): 77 def __init__(self, dmm, fe_type): 78 members = [ 79 ('data', types.voidptr), 80 ('length', types.intp), 81 ('kind', types.int32), 82 ('is_ascii', types.uint32), 83 ('hash', _Py_hash_t), 84 ('meminfo', types.MemInfoPointer(types.voidptr)), 85 # A pointer to the owner python str/unicode object 86 ('parent', types.pyobject), 87 ] 88 models.StructModel.__init__(self, dmm, fe_type, members) 89 90 91make_attribute_wrapper(types.UnicodeType, 'data', '_data') 92make_attribute_wrapper(types.UnicodeType, 'length', '_length') 93make_attribute_wrapper(types.UnicodeType, 'kind', '_kind') 94make_attribute_wrapper(types.UnicodeType, 'is_ascii', '_is_ascii') 95make_attribute_wrapper(types.UnicodeType, 'hash', '_hash') 96 97 98@register_default(types.UnicodeIteratorType) 99class UnicodeIteratorModel(StructModel): 100 def __init__(self, dmm, fe_type): 101 members = [('index', types.EphemeralPointer(types.uintp)), 102 ('data', fe_type.data)] 103 super(UnicodeIteratorModel, self).__init__(dmm, fe_type, members) 104 105# CAST 106 107 108def compile_time_get_string_data(obj): 109 """Get string data from a python string for use at compile-time to embed 110 the string data into the LLVM module. 111 """ 112 from ctypes import ( 113 CFUNCTYPE, c_void_p, c_int, c_uint, c_ssize_t, c_ubyte, py_object, 114 POINTER, byref, 115 ) 116 117 extract_unicode_fn = c_helpers['extract_unicode'] 118 proto = CFUNCTYPE(c_void_p, py_object, POINTER(c_ssize_t), POINTER(c_int), 119 POINTER(c_uint), POINTER(c_ssize_t)) 120 fn = proto(extract_unicode_fn) 121 length = c_ssize_t() 122 kind = c_int() 123 is_ascii = c_uint() 124 hashv = c_ssize_t() 125 data = fn(obj, byref(length), byref(kind), byref(is_ascii), byref(hashv)) 126 if data is None: 127 raise ValueError("cannot extract unicode data from the given string") 128 length = length.value 129 kind = kind.value 130 is_ascii = is_ascii.value 131 nbytes = (length + 1) * _kind_to_byte_width(kind) 132 out = (c_ubyte * nbytes).from_address(data) 133 return bytes(out), length, kind, is_ascii, hashv.value 134 135 136def make_string_from_constant(context, builder, typ, literal_string): 137 """ 138 Get string data by `compile_time_get_string_data()` and return a 139 unicode_type LLVM value 140 """ 141 databytes, length, kind, is_ascii, hashv = \ 142 compile_time_get_string_data(literal_string) 143 mod = builder.module 144 gv = context.insert_const_bytes(mod, databytes) 145 uni_str = cgutils.create_struct_proxy(typ)(context, builder) 146 uni_str.data = gv 147 uni_str.length = uni_str.length.type(length) 148 uni_str.kind = uni_str.kind.type(kind) 149 uni_str.is_ascii = uni_str.is_ascii.type(is_ascii) 150 # Set hash to -1 to indicate that it should be computed. 151 # We cannot bake in the hash value because of hashseed randomization. 152 uni_str.hash = uni_str.hash.type(-1) 153 return uni_str._getvalue() 154 155 156@lower_cast(types.StringLiteral, types.unicode_type) 157def cast_from_literal(context, builder, fromty, toty, val): 158 return make_string_from_constant( 159 context, builder, toty, fromty.literal_value, 160 ) 161 162 163# CONSTANT 164 165@lower_constant(types.unicode_type) 166def constant_unicode(context, builder, typ, pyval): 167 return make_string_from_constant(context, builder, typ, pyval) 168 169 170# BOXING 171 172 173@unbox(types.UnicodeType) 174def unbox_unicode_str(typ, obj, c): 175 """ 176 Convert a unicode str object to a native unicode structure. 177 """ 178 ok, data, length, kind, is_ascii, hashv = \ 179 c.pyapi.string_as_string_size_and_kind(obj) 180 uni_str = cgutils.create_struct_proxy(typ)(c.context, c.builder) 181 uni_str.data = data 182 uni_str.length = length 183 uni_str.kind = kind 184 uni_str.is_ascii = is_ascii 185 uni_str.hash = hashv 186 uni_str.meminfo = c.pyapi.nrt_meminfo_new_from_pyobject( 187 data, # the borrowed data pointer 188 obj, # the owner pyobject; the call will incref it. 189 ) 190 uni_str.parent = obj 191 192 is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) 193 return NativeValue(uni_str._getvalue(), is_error=is_error) 194 195 196@box(types.UnicodeType) 197def box_unicode_str(typ, val, c): 198 """ 199 Convert a native unicode structure to a unicode string 200 """ 201 uni_str = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) 202 res = c.pyapi.string_from_kind_and_data( 203 uni_str.kind, uni_str.data, uni_str.length) 204 # hash isn't needed now, just compute it so it ends up in the unicodeobject 205 # hash cache, cpython doesn't always do this, depends how a string was 206 # created it's safe, just burns the cycles required to hash on @box 207 c.pyapi.object_hash(res) 208 c.context.nrt.decref(c.builder, typ, val) 209 return res 210 211 212# HELPER FUNCTIONS 213 214 215def make_deref_codegen(bitsize): 216 def codegen(context, builder, signature, args): 217 data, idx = args 218 ptr = builder.bitcast(data, IntType(bitsize).as_pointer()) 219 ch = builder.load(builder.gep(ptr, [idx])) 220 return builder.zext(ch, IntType(32)) 221 222 return codegen 223 224 225@intrinsic 226def deref_uint8(typingctx, data, offset): 227 sig = types.uint32(types.voidptr, types.intp) 228 return sig, make_deref_codegen(8) 229 230 231@intrinsic 232def deref_uint16(typingctx, data, offset): 233 sig = types.uint32(types.voidptr, types.intp) 234 return sig, make_deref_codegen(16) 235 236 237@intrinsic 238def deref_uint32(typingctx, data, offset): 239 sig = types.uint32(types.voidptr, types.intp) 240 return sig, make_deref_codegen(32) 241 242 243@intrinsic 244def _malloc_string(typingctx, kind, char_bytes, length, is_ascii): 245 """make empty string with data buffer of size alloc_bytes. 246 247 Must set length and kind values for string after it is returned 248 """ 249 def details(context, builder, signature, args): 250 [kind_val, char_bytes_val, length_val, is_ascii_val] = args 251 252 # fill the struct 253 uni_str_ctor = cgutils.create_struct_proxy(types.unicode_type) 254 uni_str = uni_str_ctor(context, builder) 255 # add null padding character 256 nbytes_val = builder.mul(char_bytes_val, 257 builder.add(length_val, 258 Constant(length_val.type, 1))) 259 uni_str.meminfo = context.nrt.meminfo_alloc(builder, nbytes_val) 260 uni_str.kind = kind_val 261 uni_str.is_ascii = is_ascii_val 262 uni_str.length = length_val 263 # empty string has hash value -1 to indicate "need to compute hash" 264 uni_str.hash = context.get_constant(_Py_hash_t, -1) 265 uni_str.data = context.nrt.meminfo_data(builder, uni_str.meminfo) 266 # Set parent to NULL 267 uni_str.parent = cgutils.get_null_value(uni_str.parent.type) 268 return uni_str._getvalue() 269 270 sig = types.unicode_type(types.int32, types.intp, types.intp, types.uint32) 271 return sig, details 272 273 274@register_jitable 275def _empty_string(kind, length, is_ascii=0): 276 char_width = _kind_to_byte_width(kind) 277 s = _malloc_string(kind, char_width, length, is_ascii) 278 _set_code_point(s, length, np.uint32(0)) # Write NULL character 279 return s 280 281 282# Disable RefCt for performance. 283@register_jitable(_nrt=False) 284def _get_code_point(a, i): 285 if a._kind == PY_UNICODE_1BYTE_KIND: 286 return deref_uint8(a._data, i) 287 elif a._kind == PY_UNICODE_2BYTE_KIND: 288 return deref_uint16(a._data, i) 289 elif a._kind == PY_UNICODE_4BYTE_KIND: 290 return deref_uint32(a._data, i) 291 else: 292 # there's also a wchar kind, but that's one of the above, 293 # so skipping for this example 294 return 0 295 296#### 297 298 299def make_set_codegen(bitsize): 300 def codegen(context, builder, signature, args): 301 data, idx, ch = args 302 if bitsize < 32: 303 ch = builder.trunc(ch, IntType(bitsize)) 304 ptr = builder.bitcast(data, IntType(bitsize).as_pointer()) 305 builder.store(ch, builder.gep(ptr, [idx])) 306 return context.get_dummy_value() 307 308 return codegen 309 310 311@intrinsic 312def set_uint8(typingctx, data, idx, ch): 313 sig = types.void(types.voidptr, types.int64, types.uint32) 314 return sig, make_set_codegen(8) 315 316 317@intrinsic 318def set_uint16(typingctx, data, idx, ch): 319 sig = types.void(types.voidptr, types.int64, types.uint32) 320 return sig, make_set_codegen(16) 321 322 323@intrinsic 324def set_uint32(typingctx, data, idx, ch): 325 sig = types.void(types.voidptr, types.int64, types.uint32) 326 return sig, make_set_codegen(32) 327 328 329@register_jitable(_nrt=False) 330def _set_code_point(a, i, ch): 331 # WARNING: This method is very dangerous: 332 # * Assumes that data contents can be changed (only allowed for new 333 # strings) 334 # * Assumes that the kind of unicode string is sufficiently wide to 335 # accept ch. Will truncate ch to make it fit. 336 # * Assumes that i is within the valid boundaries of the function 337 if a._kind == PY_UNICODE_1BYTE_KIND: 338 set_uint8(a._data, i, ch) 339 elif a._kind == PY_UNICODE_2BYTE_KIND: 340 set_uint16(a._data, i, ch) 341 elif a._kind == PY_UNICODE_4BYTE_KIND: 342 set_uint32(a._data, i, ch) 343 else: 344 raise AssertionError( 345 "Unexpected unicode representation in _set_code_point") 346 347 348@register_jitable 349def _pick_kind(kind1, kind2): 350 if kind1 == PY_UNICODE_WCHAR_KIND or kind2 == PY_UNICODE_WCHAR_KIND: 351 raise AssertionError("PY_UNICODE_WCHAR_KIND unsupported") 352 353 if kind1 == PY_UNICODE_1BYTE_KIND: 354 return kind2 355 elif kind1 == PY_UNICODE_2BYTE_KIND: 356 if kind2 == PY_UNICODE_4BYTE_KIND: 357 return kind2 358 else: 359 return kind1 360 elif kind1 == PY_UNICODE_4BYTE_KIND: 361 return kind1 362 else: 363 raise AssertionError("Unexpected unicode representation in _pick_kind") 364 365 366@register_jitable 367def _pick_ascii(is_ascii1, is_ascii2): 368 if is_ascii1 == 1 and is_ascii2 == 1: 369 return types.uint32(1) 370 return types.uint32(0) 371 372 373@register_jitable 374def _kind_to_byte_width(kind): 375 if kind == PY_UNICODE_1BYTE_KIND: 376 return 1 377 elif kind == PY_UNICODE_2BYTE_KIND: 378 return 2 379 elif kind == PY_UNICODE_4BYTE_KIND: 380 return 4 381 elif kind == PY_UNICODE_WCHAR_KIND: 382 raise AssertionError("PY_UNICODE_WCHAR_KIND unsupported") 383 else: 384 raise AssertionError("Unexpected unicode encoding encountered") 385 386 387@register_jitable(_nrt=False) 388def _cmp_region(a, a_offset, b, b_offset, n): 389 if n == 0: 390 return 0 391 elif a_offset + n > a._length: 392 return -1 393 elif b_offset + n > b._length: 394 return 1 395 396 for i in range(n): 397 a_chr = _get_code_point(a, a_offset + i) 398 b_chr = _get_code_point(b, b_offset + i) 399 if a_chr < b_chr: 400 return -1 401 elif a_chr > b_chr: 402 return 1 403 404 return 0 405 406 407@register_jitable 408def _codepoint_to_kind(cp): 409 """ 410 Compute the minimum unicode kind needed to hold a given codepoint 411 """ 412 if cp < 256: 413 return PY_UNICODE_1BYTE_KIND 414 elif cp < 65536: 415 return PY_UNICODE_2BYTE_KIND 416 else: 417 # Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) 418 MAX_UNICODE = 0x10ffff 419 if cp > MAX_UNICODE: 420 msg = "Invalid codepoint. Found value greater than Unicode maximum" 421 raise ValueError(msg) 422 return PY_UNICODE_4BYTE_KIND 423 424 425@register_jitable 426def _codepoint_is_ascii(ch): 427 """ 428 Returns true if a codepoint is in the ASCII range 429 """ 430 return ch < 128 431 432 433# PUBLIC API 434 435 436@overload(str) 437def unicode_str(s): 438 if isinstance(s, types.UnicodeType): 439 return lambda s: s 440 441 442@overload(len) 443def unicode_len(s): 444 if isinstance(s, types.UnicodeType): 445 def len_impl(s): 446 return s._length 447 return len_impl 448 449 450@overload(operator.eq) 451def unicode_eq(a, b): 452 if not (a.is_internal and b.is_internal): 453 return 454 accept = (types.UnicodeType, types.StringLiteral, types.UnicodeCharSeq) 455 a_unicode = isinstance(a, accept) 456 b_unicode = isinstance(b, accept) 457 if a_unicode and b_unicode: 458 def eq_impl(a, b): 459 # the str() is for UnicodeCharSeq, it's a nop else 460 a = str(a) 461 b = str(b) 462 if len(a) != len(b): 463 return False 464 return _cmp_region(a, 0, b, 0, len(a)) == 0 465 return eq_impl 466 elif a_unicode ^ b_unicode: 467 # one of the things is unicode, everything compares False 468 def eq_impl(a, b): 469 return False 470 return eq_impl 471 472 473@overload(operator.ne) 474def unicode_ne(a, b): 475 if not (a.is_internal and b.is_internal): 476 return 477 accept = (types.UnicodeType, types.StringLiteral, types.UnicodeCharSeq) 478 a_unicode = isinstance(a, accept) 479 b_unicode = isinstance(b, accept) 480 if a_unicode and b_unicode: 481 def ne_impl(a, b): 482 return not (a == b) 483 return ne_impl 484 elif a_unicode ^ b_unicode: 485 # one of the things is unicode, everything compares True 486 def eq_impl(a, b): 487 return True 488 return eq_impl 489 490 491@overload(operator.lt) 492def unicode_lt(a, b): 493 a_unicode = isinstance(a, (types.UnicodeType, types.StringLiteral)) 494 b_unicode = isinstance(b, (types.UnicodeType, types.StringLiteral)) 495 if a_unicode and b_unicode: 496 def lt_impl(a, b): 497 minlen = min(len(a), len(b)) 498 eqcode = _cmp_region(a, 0, b, 0, minlen) 499 if eqcode == -1: 500 return True 501 elif eqcode == 0: 502 return len(a) < len(b) 503 return False 504 return lt_impl 505 506 507@overload(operator.gt) 508def unicode_gt(a, b): 509 a_unicode = isinstance(a, (types.UnicodeType, types.StringLiteral)) 510 b_unicode = isinstance(b, (types.UnicodeType, types.StringLiteral)) 511 if a_unicode and b_unicode: 512 def gt_impl(a, b): 513 minlen = min(len(a), len(b)) 514 eqcode = _cmp_region(a, 0, b, 0, minlen) 515 if eqcode == 1: 516 return True 517 elif eqcode == 0: 518 return len(a) > len(b) 519 return False 520 return gt_impl 521 522 523@overload(operator.le) 524def unicode_le(a, b): 525 a_unicode = isinstance(a, (types.UnicodeType, types.StringLiteral)) 526 b_unicode = isinstance(b, (types.UnicodeType, types.StringLiteral)) 527 if a_unicode and b_unicode: 528 def le_impl(a, b): 529 return not (a > b) 530 return le_impl 531 532 533@overload(operator.ge) 534def unicode_ge(a, b): 535 a_unicode = isinstance(a, (types.UnicodeType, types.StringLiteral)) 536 b_unicode = isinstance(b, (types.UnicodeType, types.StringLiteral)) 537 if a_unicode and b_unicode: 538 def ge_impl(a, b): 539 return not (a < b) 540 return ge_impl 541 542 543@overload(operator.contains) 544def unicode_contains(a, b): 545 if isinstance(a, types.UnicodeType) and isinstance(b, types.UnicodeType): 546 def contains_impl(a, b): 547 # note parameter swap: contains(a, b) == b in a 548 return _find(a, b) > -1 549 return contains_impl 550 551 552def unicode_idx_check_type(ty, name): 553 """Check object belongs to one of specific types 554 ty: type 555 Type of the object 556 name: str 557 Name of the object 558 """ 559 thety = ty 560 # if the type is omitted, the concrete type is the value 561 if isinstance(ty, types.Omitted): 562 thety = ty.value 563 # if the type is optional, the concrete type is the captured type 564 elif isinstance(ty, types.Optional): 565 thety = ty.type 566 567 accepted = (types.Integer, types.NoneType) 568 if thety is not None and not isinstance(thety, accepted): 569 raise TypingError('"{}" must be {}, not {}'.format(name, accepted, ty)) 570 571 572def unicode_sub_check_type(ty, name): 573 """Check object belongs to unicode type""" 574 if not isinstance(ty, types.UnicodeType): 575 msg = '"{}" must be {}, not {}'.format(name, types.UnicodeType, ty) 576 raise TypingError(msg) 577 578 579def generate_finder(find_func): 580 """Generate finder either left or right.""" 581 def impl(data, substr, start=None, end=None): 582 length = len(data) 583 sub_length = len(substr) 584 if start is None: 585 start = 0 586 if end is None: 587 end = length 588 589 start, end = _adjust_indices(length, start, end) 590 if end - start < sub_length: 591 return -1 592 593 return find_func(data, substr, start, end) 594 595 return impl 596 597 598@register_jitable 599def _finder(data, substr, start, end): 600 """Left finder.""" 601 if len(substr) == 0: 602 return start 603 for i in range(start, min(len(data), end) - len(substr) + 1): 604 if _cmp_region(data, i, substr, 0, len(substr)) == 0: 605 return i 606 return -1 607 608 609@register_jitable 610def _rfinder(data, substr, start, end): 611 """Right finder.""" 612 if len(substr) == 0: 613 return end 614 for i in range(min(len(data), end) - len(substr), start - 1, -1): 615 if _cmp_region(data, i, substr, 0, len(substr)) == 0: 616 return i 617 return -1 618 619 620_find = register_jitable(generate_finder(_finder)) 621_rfind = register_jitable(generate_finder(_rfinder)) 622 623 624@overload_method(types.UnicodeType, 'find') 625def unicode_find(data, substr, start=None, end=None): 626 """Implements str.find()""" 627 if isinstance(substr, types.UnicodeCharSeq): 628 def find_impl(data, substr, start=None, end=None): 629 return data.find(str(substr)) 630 return find_impl 631 632 unicode_idx_check_type(start, 'start') 633 unicode_idx_check_type(end, 'end') 634 unicode_sub_check_type(substr, 'substr') 635 636 return _find 637 638 639@overload_method(types.UnicodeType, 'rfind') 640def unicode_rfind(data, substr, start=None, end=None): 641 """Implements str.rfind()""" 642 if isinstance(substr, types.UnicodeCharSeq): 643 def rfind_impl(data, substr, start=None, end=None): 644 return data.rfind(str(substr)) 645 return rfind_impl 646 647 unicode_idx_check_type(start, 'start') 648 unicode_idx_check_type(end, 'end') 649 unicode_sub_check_type(substr, 'substr') 650 651 return _rfind 652 653 654# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12831-L12857 # noqa: E501 655@overload_method(types.UnicodeType, 'rindex') 656def unicode_rindex(s, sub, start=None, end=None): 657 """Implements str.rindex()""" 658 unicode_idx_check_type(start, 'start') 659 unicode_idx_check_type(end, 'end') 660 unicode_sub_check_type(sub, 'sub') 661 662 def rindex_impl(s, sub, start=None, end=None): 663 result = s.rfind(sub, start, end) 664 if result < 0: 665 raise ValueError('substring not found') 666 667 return result 668 669 return rindex_impl 670 671 672# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11692-L11718 # noqa: E501 673@overload_method(types.UnicodeType, 'index') 674def unicode_index(s, sub, start=None, end=None): 675 """Implements str.index()""" 676 unicode_idx_check_type(start, 'start') 677 unicode_idx_check_type(end, 'end') 678 unicode_sub_check_type(sub, 'sub') 679 680 def index_impl(s, sub, start=None, end=None): 681 result = s.find(sub, start, end) 682 if result < 0: 683 raise ValueError('substring not found') 684 685 return result 686 687 return index_impl 688 689 690# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12922-L12976 # noqa: E501 691@overload_method(types.UnicodeType, 'partition') 692def unicode_partition(data, sep): 693 """Implements str.partition()""" 694 thety = sep 695 # if the type is omitted, the concrete type is the value 696 if isinstance(sep, types.Omitted): 697 thety = sep.value 698 # if the type is optional, the concrete type is the captured type 699 elif isinstance(sep, types.Optional): 700 thety = sep.type 701 702 accepted = (types.UnicodeType, types.UnicodeCharSeq) 703 if thety is not None and not isinstance(thety, accepted): 704 msg = '"{}" must be {}, not {}'.format('sep', accepted, sep) 705 raise TypingError(msg) 706 707 def impl(data, sep): 708 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/partition.h#L7-L60 # noqa: E501 709 sep = str(sep) 710 empty_str = _empty_string(data._kind, 0, data._is_ascii) 711 sep_length = len(sep) 712 if data._kind < sep._kind or len(data) < sep_length: 713 return data, empty_str, empty_str 714 715 if sep_length == 0: 716 raise ValueError('empty separator') 717 718 pos = data.find(sep) 719 if pos < 0: 720 return data, empty_str, empty_str 721 722 return data[0:pos], sep, data[pos + sep_length:len(data)] 723 724 return impl 725 726 727@overload_method(types.UnicodeType, 'count') 728def unicode_count(src, sub, start=None, end=None): 729 730 _count_args_types_check(start) 731 _count_args_types_check(end) 732 733 if isinstance(sub, types.UnicodeType): 734 def count_impl(src, sub, start=None, end=None): 735 count = 0 736 src_len = len(src) 737 sub_len = len(sub) 738 739 start = _normalize_slice_idx_count(start, src_len, 0) 740 end = _normalize_slice_idx_count(end, src_len, src_len) 741 742 if end - start < 0 or start > src_len: 743 return 0 744 745 src = src[start : end] 746 src_len = len(src) 747 start, end = 0, src_len 748 if sub_len == 0: 749 return src_len + 1 750 751 while(start + sub_len <= src_len): 752 if src[start : start + sub_len] == sub: 753 count += 1 754 start += sub_len 755 else: 756 start += 1 757 return count 758 return count_impl 759 error_msg = "The substring must be a UnicodeType, not {}" 760 raise TypingError(error_msg.format(type(sub))) 761 762 763# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12979-L13033 # noqa: E501 764@overload_method(types.UnicodeType, 'rpartition') 765def unicode_rpartition(data, sep): 766 """Implements str.rpartition()""" 767 thety = sep 768 # if the type is omitted, the concrete type is the value 769 if isinstance(sep, types.Omitted): 770 thety = sep.value 771 # if the type is optional, the concrete type is the captured type 772 elif isinstance(sep, types.Optional): 773 thety = sep.type 774 775 accepted = (types.UnicodeType, types.UnicodeCharSeq) 776 if thety is not None and not isinstance(thety, accepted): 777 msg = '"{}" must be {}, not {}'.format('sep', accepted, sep) 778 raise TypingError(msg) 779 780 def impl(data, sep): 781 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/partition.h#L62-L115 # noqa: E501 782 sep = str(sep) 783 empty_str = _empty_string(data._kind, 0, data._is_ascii) 784 sep_length = len(sep) 785 if data._kind < sep._kind or len(data) < sep_length: 786 return empty_str, empty_str, data 787 788 if sep_length == 0: 789 raise ValueError('empty separator') 790 791 pos = data.rfind(sep) 792 if pos < 0: 793 return empty_str, empty_str, data 794 795 return data[0:pos], sep, data[pos + sep_length:len(data)] 796 797 return impl 798 799 800@overload_method(types.UnicodeType, 'startswith') 801def unicode_startswith(a, b): 802 if isinstance(b, types.UnicodeType): 803 def startswith_impl(a, b): 804 return _cmp_region(a, 0, b, 0, len(b)) == 0 805 return startswith_impl 806 if isinstance(b, types.UnicodeCharSeq): 807 def startswith_impl(a, b): 808 return a.startswith(str(b)) 809 return startswith_impl 810 811 812# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9342-L9354 # noqa: E501 813@register_jitable 814def _adjust_indices(length, start, end): 815 if end > length: 816 end = length 817 if end < 0: 818 end += length 819 if end < 0: 820 end = 0 821 if start < 0: 822 start += length 823 if start < 0: 824 start = 0 825 826 return start, end 827 828 829@overload_method(types.UnicodeType, 'endswith') 830def unicode_endswith(s, substr, start=None, end=None): 831 if not (start is None or isinstance(start, (types.Omitted, 832 types.Integer, 833 types.NoneType))): 834 raise TypingError('The arg must be a Integer or None') 835 836 if not (end is None or isinstance(end, (types.Omitted, 837 types.Integer, 838 types.NoneType))): 839 raise TypingError('The arg must be a Integer or None') 840 841 if isinstance(substr, (types.Tuple, types.UniTuple)): 842 def endswith_impl(s, substr, start=None, end=None): 843 for item in substr: 844 if s.endswith(item, start, end) is True: 845 return True 846 847 return False 848 return endswith_impl 849 850 if isinstance(substr, types.UnicodeType): 851 def endswith_impl(s, substr, start=None, end=None): 852 length = len(s) 853 sub_length = len(substr) 854 if start is None: 855 start = 0 856 if end is None: 857 end = length 858 859 start, end = _adjust_indices(length, start, end) 860 if end - start < sub_length: 861 return False 862 863 if sub_length == 0: 864 return True 865 866 s = s[start:end] 867 offset = len(s) - sub_length 868 869 return _cmp_region(s, offset, substr, 0, sub_length) == 0 870 return endswith_impl 871 872 if isinstance(substr, types.UnicodeCharSeq): 873 def endswith_impl(s, substr, start=None, end=None): 874 return s.endswith(str(substr), start, end) 875 return endswith_impl 876 877 878# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11519-L11595 # noqa: E501 879@overload_method(types.UnicodeType, 'expandtabs') 880def unicode_expandtabs(data, tabsize=8): 881 """Implements str.expandtabs()""" 882 thety = tabsize 883 # if the type is omitted, the concrete type is the value 884 if isinstance(tabsize, types.Omitted): 885 thety = tabsize.value 886 # if the type is optional, the concrete type is the captured type 887 elif isinstance(tabsize, types.Optional): 888 thety = tabsize.type 889 890 accepted = (types.Integer, int) 891 if thety is not None and not isinstance(thety, accepted): 892 raise TypingError( 893 '"tabsize" must be {}, not {}'.format(accepted, tabsize)) 894 895 def expandtabs_impl(data, tabsize=8): 896 length = len(data) 897 j = line_pos = 0 898 found = False 899 for i in range(length): 900 code_point = _get_code_point(data, i) 901 if code_point == _Py_TAB: 902 found = True 903 if tabsize > 0: 904 # cannot overflow 905 incr = tabsize - (line_pos % tabsize) 906 if j > sys.maxsize - incr: 907 raise OverflowError('new string is too long') 908 line_pos += incr 909 j += incr 910 else: 911 if j > sys.maxsize - 1: 912 raise OverflowError('new string is too long') 913 line_pos += 1 914 j += 1 915 if code_point in (_Py_LINEFEED, _Py_CARRIAGE_RETURN): 916 line_pos = 0 917 918 if not found: 919 return data 920 921 res = _empty_string(data._kind, j, data._is_ascii) 922 j = line_pos = 0 923 for i in range(length): 924 code_point = _get_code_point(data, i) 925 if code_point == _Py_TAB: 926 if tabsize > 0: 927 incr = tabsize - (line_pos % tabsize) 928 line_pos += incr 929 for idx in range(j, j + incr): 930 _set_code_point(res, idx, _Py_SPACE) 931 j += incr 932 else: 933 line_pos += 1 934 _set_code_point(res, j, code_point) 935 j += 1 936 if code_point in (_Py_LINEFEED, _Py_CARRIAGE_RETURN): 937 line_pos = 0 938 939 return res 940 941 return expandtabs_impl 942 943 944@overload_method(types.UnicodeType, 'split') 945def unicode_split(a, sep=None, maxsplit=-1): 946 if not (maxsplit == -1 or 947 isinstance(maxsplit, (types.Omitted, types.Integer, 948 types.IntegerLiteral))): 949 return None # fail typing if maxsplit is not an integer 950 951 if isinstance(sep, types.UnicodeCharSeq): 952 def split_impl(a, sep=None, maxsplit=-1): 953 return a.split(str(sep), maxsplit=maxsplit) 954 return split_impl 955 956 if isinstance(sep, types.UnicodeType): 957 def split_impl(a, sep=None, maxsplit=-1): 958 a_len = len(a) 959 sep_len = len(sep) 960 961 if sep_len == 0: 962 raise ValueError('empty separator') 963 964 parts = [] 965 last = 0 966 idx = 0 967 968 if sep_len == 1 and maxsplit == -1: 969 sep_code_point = _get_code_point(sep, 0) 970 for idx in range(a_len): 971 if _get_code_point(a, idx) == sep_code_point: 972 parts.append(a[last:idx]) 973 last = idx + 1 974 else: 975 split_count = 0 976 977 while idx < a_len and (maxsplit == -1 or 978 split_count < maxsplit): 979 if _cmp_region(a, idx, sep, 0, sep_len) == 0: 980 parts.append(a[last:idx]) 981 idx += sep_len 982 last = idx 983 split_count += 1 984 else: 985 idx += 1 986 987 if last <= a_len: 988 parts.append(a[last:]) 989 990 return parts 991 return split_impl 992 elif sep is None or isinstance(sep, types.NoneType) or \ 993 getattr(sep, 'value', False) is None: 994 def split_whitespace_impl(a, sep=None, maxsplit=-1): 995 a_len = len(a) 996 997 parts = [] 998 last = 0 999 idx = 0 1000 split_count = 0 1001 in_whitespace_block = True 1002 1003 for idx in range(a_len): 1004 code_point = _get_code_point(a, idx) 1005 is_whitespace = _PyUnicode_IsSpace(code_point) 1006 if in_whitespace_block: 1007 if is_whitespace: 1008 pass # keep consuming space 1009 else: 1010 last = idx # this is the start of the next string 1011 in_whitespace_block = False 1012 else: 1013 if not is_whitespace: 1014 pass # keep searching for whitespace transition 1015 else: 1016 parts.append(a[last:idx]) 1017 in_whitespace_block = True 1018 split_count += 1 1019 if maxsplit != -1 and split_count == maxsplit: 1020 break 1021 1022 if last <= a_len and not in_whitespace_block: 1023 parts.append(a[last:]) 1024 1025 return parts 1026 return split_whitespace_impl 1027 1028 1029def generate_rsplit_whitespace_impl(isspace_func): 1030 """Generate whitespace rsplit func based on either ascii or unicode""" 1031 1032 def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): 1033 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L192-L240 # noqa: E501 1034 if maxsplit < 0: 1035 maxsplit = sys.maxsize 1036 1037 result = [] 1038 i = len(data) - 1 1039 while maxsplit > 0: 1040 while i >= 0: 1041 code_point = _get_code_point(data, i) 1042 if not isspace_func(code_point): 1043 break 1044 i -= 1 1045 if i < 0: 1046 break 1047 j = i 1048 i -= 1 1049 while i >= 0: 1050 code_point = _get_code_point(data, i) 1051 if isspace_func(code_point): 1052 break 1053 i -= 1 1054 result.append(data[i + 1:j + 1]) 1055 maxsplit -= 1 1056 1057 if i >= 0: 1058 # Only occurs when maxsplit was reached 1059 # Skip any remaining whitespace and copy to beginning of string 1060 while i >= 0: 1061 code_point = _get_code_point(data, i) 1062 if not isspace_func(code_point): 1063 break 1064 i -= 1 1065 if i >= 0: 1066 result.append(data[0:i + 1]) 1067 1068 return result[::-1] 1069 1070 return rsplit_whitespace_impl 1071 1072 1073unicode_rsplit_whitespace_impl = register_jitable( 1074 generate_rsplit_whitespace_impl(_PyUnicode_IsSpace)) 1075ascii_rsplit_whitespace_impl = register_jitable( 1076 generate_rsplit_whitespace_impl(_Py_ISSPACE)) 1077 1078 1079# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13095-L13108 # noqa: E501 1080@overload_method(types.UnicodeType, 'rsplit') 1081def unicode_rsplit(data, sep=None, maxsplit=-1): 1082 """Implements str.unicode_rsplit()""" 1083 1084 def _unicode_rsplit_check_type(ty, name, accepted): 1085 """Check object belongs to one of specified types""" 1086 thety = ty 1087 # if the type is omitted, the concrete type is the value 1088 if isinstance(ty, types.Omitted): 1089 thety = ty.value 1090 # if the type is optional, the concrete type is the captured type 1091 elif isinstance(ty, types.Optional): 1092 thety = ty.type 1093 1094 if thety is not None and not isinstance(thety, accepted): 1095 raise TypingError( 1096 '"{}" must be {}, not {}'.format(name, accepted, ty)) 1097 1098 _unicode_rsplit_check_type(sep, 'sep', (types.UnicodeType, 1099 types.UnicodeCharSeq, 1100 types.NoneType)) 1101 _unicode_rsplit_check_type(maxsplit, 'maxsplit', (types.Integer, int)) 1102 1103 if sep is None or isinstance(sep, (types.NoneType, types.Omitted)): 1104 1105 def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): 1106 if data._is_ascii: 1107 return ascii_rsplit_whitespace_impl(data, sep, maxsplit) 1108 return unicode_rsplit_whitespace_impl(data, sep, maxsplit) 1109 1110 return rsplit_whitespace_impl 1111 1112 def rsplit_impl(data, sep=None, maxsplit=-1): 1113 sep = str(sep) 1114 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L286-L333 # noqa: E501 1115 if data._kind < sep._kind or len(data) < len(sep): 1116 return [data] 1117 1118 def _rsplit_char(data, ch, maxsplit): 1119 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L242-L284 # noqa: E501 1120 result = [] 1121 ch_code_point = _get_code_point(ch, 0) 1122 i = j = len(data) - 1 1123 while i >= 0 and maxsplit > 0: 1124 data_code_point = _get_code_point(data, i) 1125 if data_code_point == ch_code_point: 1126 result.append(data[i + 1 : j + 1]) 1127 j = i = i - 1 1128 maxsplit -= 1 1129 i -= 1 1130 if j >= -1: 1131 result.append(data[0 : j + 1]) 1132 1133 return result[::-1] 1134 1135 if maxsplit < 0: 1136 maxsplit = sys.maxsize 1137 1138 sep_length = len(sep) 1139 1140 if sep_length == 0: 1141 raise ValueError('empty separator') 1142 if sep_length == 1: 1143 return _rsplit_char(data, sep, maxsplit) 1144 1145 result = [] 1146 j = len(data) 1147 while maxsplit > 0: 1148 pos = data.rfind(sep, start=0, end=j) 1149 if pos < 0: 1150 break 1151 result.append(data[pos + sep_length:j]) 1152 j = pos 1153 maxsplit -= 1 1154 1155 result.append(data[0:j]) 1156 1157 return result[::-1] 1158 1159 return rsplit_impl 1160 1161 1162@overload_method(types.UnicodeType, 'center') 1163def unicode_center(string, width, fillchar=' '): 1164 if not isinstance(width, types.Integer): 1165 raise TypingError('The width must be an Integer') 1166 1167 if isinstance(fillchar, types.UnicodeCharSeq): 1168 def center_impl(string, width, fillchar=' '): 1169 return string.center(width, str(fillchar)) 1170 return center_impl 1171 1172 if not (fillchar == ' ' or 1173 isinstance(fillchar, (types.Omitted, types.UnicodeType))): 1174 raise TypingError('The fillchar must be a UnicodeType') 1175 1176 def center_impl(string, width, fillchar=' '): 1177 str_len = len(string) 1178 fillchar_len = len(fillchar) 1179 1180 if fillchar_len != 1: 1181 raise ValueError('The fill character must be exactly one ' 1182 'character long') 1183 1184 if width <= str_len: 1185 return string 1186 1187 allmargin = width - str_len 1188 lmargin = (allmargin // 2) + (allmargin & width & 1) 1189 rmargin = allmargin - lmargin 1190 1191 l_string = fillchar * lmargin 1192 if lmargin == rmargin: 1193 return l_string + string + l_string 1194 else: 1195 return l_string + string + (fillchar * rmargin) 1196 1197 return center_impl 1198 1199 1200def gen_unicode_Xjust(STRING_FIRST): 1201 def unicode_Xjust(string, width, fillchar=' '): 1202 if not isinstance(width, types.Integer): 1203 raise TypingError('The width must be an Integer') 1204 1205 if isinstance(fillchar, types.UnicodeCharSeq): 1206 if STRING_FIRST: 1207 def ljust_impl(string, width, fillchar=' '): 1208 return string.ljust(width, str(fillchar)) 1209 return ljust_impl 1210 else: 1211 def rjust_impl(string, width, fillchar=' '): 1212 return string.rjust(width, str(fillchar)) 1213 return rjust_impl 1214 1215 if not (fillchar == ' ' or 1216 isinstance(fillchar, (types.Omitted, types.UnicodeType))): 1217 raise TypingError('The fillchar must be a UnicodeType') 1218 1219 def impl(string, width, fillchar=' '): 1220 str_len = len(string) 1221 fillchar_len = len(fillchar) 1222 1223 if fillchar_len != 1: 1224 raise ValueError('The fill character must be exactly one ' 1225 'character long') 1226 1227 if width <= str_len: 1228 return string 1229 1230 newstr = (fillchar * (width - str_len)) 1231 if STRING_FIRST: 1232 return string + newstr 1233 else: 1234 return newstr + string 1235 1236 return impl 1237 1238 return unicode_Xjust 1239 1240 1241overload_method(types.UnicodeType, 'rjust')(gen_unicode_Xjust(False)) 1242overload_method(types.UnicodeType, 'ljust')(gen_unicode_Xjust(True)) 1243 1244 1245def generate_splitlines_func(is_line_break_func): 1246 """Generate splitlines performer based on ascii or unicode line breaks.""" 1247 def impl(data, keepends): 1248 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L335-L389 # noqa: E501 1249 length = len(data) 1250 result = [] 1251 i = j = 0 1252 while i < length: 1253 # find a line and append it 1254 while i < length: 1255 code_point = _get_code_point(data, i) 1256 if is_line_break_func(code_point): 1257 break 1258 i += 1 1259 1260 # skip the line break reading CRLF as one line break 1261 eol = i 1262 if i < length: 1263 if i + 1 < length: 1264 cur_cp = _get_code_point(data, i) 1265 next_cp = _get_code_point(data, i + 1) 1266 if _Py_ISCARRIAGERETURN(cur_cp) and _Py_ISLINEFEED(next_cp): 1267 i += 1 1268 i += 1 1269 if keepends: 1270 eol = i 1271 1272 result.append(data[j:eol]) 1273 j = i 1274 1275 return result 1276 1277 return impl 1278 1279 1280_ascii_splitlines = register_jitable(generate_splitlines_func(_Py_ISLINEBREAK)) 1281_unicode_splitlines = register_jitable(generate_splitlines_func( 1282 _PyUnicode_IsLineBreak)) 1283 1284 1285# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10196-L10229 # noqa: E501 1286@overload_method(types.UnicodeType, 'splitlines') 1287def unicode_splitlines(data, keepends=False): 1288 """Implements str.splitlines()""" 1289 thety = keepends 1290 # if the type is omitted, the concrete type is the value 1291 if isinstance(keepends, types.Omitted): 1292 thety = keepends.value 1293 # if the type is optional, the concrete type is the captured type 1294 elif isinstance(keepends, types.Optional): 1295 thety = keepends.type 1296 1297 accepted = (types.Integer, int, types.Boolean, bool) 1298 if thety is not None and not isinstance(thety, accepted): 1299 raise TypingError( 1300 '"{}" must be {}, not {}'.format('keepends', accepted, keepends)) 1301 1302 def splitlines_impl(data, keepends=False): 1303 if data._is_ascii: 1304 return _ascii_splitlines(data, keepends) 1305 1306 return _unicode_splitlines(data, keepends) 1307 1308 return splitlines_impl 1309 1310 1311@register_jitable 1312def join_list(sep, parts): 1313 parts_len = len(parts) 1314 if parts_len == 0: 1315 return '' 1316 1317 # Precompute size and char_width of result 1318 sep_len = len(sep) 1319 length = (parts_len - 1) * sep_len 1320 kind = sep._kind 1321 is_ascii = sep._is_ascii 1322 for p in parts: 1323 length += len(p) 1324 kind = _pick_kind(kind, p._kind) 1325 is_ascii = _pick_ascii(is_ascii, p._is_ascii) 1326 1327 result = _empty_string(kind, length, is_ascii) 1328 1329 # populate string 1330 part = parts[0] 1331 _strncpy(result, 0, part, 0, len(part)) 1332 dst_offset = len(part) 1333 for idx in range(1, parts_len): 1334 _strncpy(result, dst_offset, sep, 0, sep_len) 1335 dst_offset += sep_len 1336 part = parts[idx] 1337 _strncpy(result, dst_offset, part, 0, len(part)) 1338 dst_offset += len(part) 1339 1340 return result 1341 1342 1343@overload_method(types.UnicodeType, 'join') 1344def unicode_join(sep, parts): 1345 1346 if isinstance(parts, types.List): 1347 if isinstance(parts.dtype, types.UnicodeType): 1348 def join_list_impl(sep, parts): 1349 return join_list(sep, parts) 1350 return join_list_impl 1351 elif isinstance(parts.dtype, types.UnicodeCharSeq): 1352 def join_list_impl(sep, parts): 1353 _parts = [str(p) for p in parts] 1354 return join_list(sep, _parts) 1355 return join_list_impl 1356 else: 1357 pass # lists of any other type not supported 1358 elif isinstance(parts, types.IterableType): 1359 def join_iter_impl(sep, parts): 1360 parts_list = [p for p in parts] 1361 return join_list(sep, parts_list) 1362 return join_iter_impl 1363 elif isinstance(parts, types.UnicodeType): 1364 # Temporary workaround until UnicodeType is iterable 1365 def join_str_impl(sep, parts): 1366 parts_list = [parts[i] for i in range(len(parts))] 1367 return join_list(sep, parts_list) 1368 return join_str_impl 1369 1370 1371@overload_method(types.UnicodeType, 'zfill') 1372def unicode_zfill(string, width): 1373 if not isinstance(width, types.Integer): 1374 raise TypingError("<width> must be an Integer") 1375 1376 def zfill_impl(string, width): 1377 1378 str_len = len(string) 1379 1380 if width <= str_len: 1381 return string 1382 1383 first_char = string[0] if str_len else '' 1384 padding = '0' * (width - str_len) 1385 1386 if first_char in ['+', '-']: 1387 newstr = first_char + padding + string[1:] 1388 else: 1389 newstr = padding + string 1390 1391 return newstr 1392 1393 return zfill_impl 1394 1395 1396# ------------------------------------------------------------------------------ 1397# Strip functions 1398# ------------------------------------------------------------------------------ 1399@register_jitable 1400def unicode_strip_left_bound(string, chars): 1401 str_len = len(string) 1402 1403 i = 0 1404 if chars is not None: 1405 for i in range(str_len): 1406 if string[i] not in chars: 1407 return i 1408 else: 1409 for i in range(str_len): 1410 if not _PyUnicode_IsSpace(string[i]): 1411 return i 1412 1413 return str_len 1414 1415 1416@register_jitable 1417def unicode_strip_right_bound(string, chars): 1418 str_len = len(string) 1419 i = 0 1420 if chars is not None: 1421 for i in range(str_len - 1, -1, -1): 1422 if string[i] not in chars: 1423 i += 1 1424 break 1425 else: 1426 for i in range(str_len - 1, -1, -1): 1427 if not _PyUnicode_IsSpace(string[i]): 1428 i += 1 1429 break 1430 1431 return i 1432 1433 1434def unicode_strip_types_check(chars): 1435 if isinstance(chars, types.Optional): 1436 chars = chars.type # catch optional type with invalid non-None type 1437 if not (chars is None or isinstance(chars, (types.Omitted, 1438 types.UnicodeType, 1439 types.NoneType))): 1440 raise TypingError('The arg must be a UnicodeType or None') 1441 1442 1443def _count_args_types_check(arg): 1444 if isinstance(arg, types.Optional): 1445 arg = arg.type 1446 if not (arg is None or isinstance(arg, (types.Omitted, 1447 types.Integer, 1448 types.NoneType))): 1449 raise TypingError("The slice indices must be an Integer or None") 1450 1451 1452@overload_method(types.UnicodeType, 'lstrip') 1453def unicode_lstrip(string, chars=None): 1454 1455 if isinstance(chars, types.UnicodeCharSeq): 1456 def lstrip_impl(string, chars=None): 1457 return string.lstrip(str(chars)) 1458 return lstrip_impl 1459 1460 unicode_strip_types_check(chars) 1461 1462 def lstrip_impl(string, chars=None): 1463 return string[unicode_strip_left_bound(string, chars):] 1464 return lstrip_impl 1465 1466 1467@overload_method(types.UnicodeType, 'rstrip') 1468def unicode_rstrip(string, chars=None): 1469 1470 if isinstance(chars, types.UnicodeCharSeq): 1471 def rstrip_impl(string, chars=None): 1472 return string.rstrip(str(chars)) 1473 return rstrip_impl 1474 1475 unicode_strip_types_check(chars) 1476 1477 def rstrip_impl(string, chars=None): 1478 return string[:unicode_strip_right_bound(string, chars)] 1479 return rstrip_impl 1480 1481 1482@overload_method(types.UnicodeType, 'strip') 1483def unicode_strip(string, chars=None): 1484 1485 if isinstance(chars, types.UnicodeCharSeq): 1486 def strip_impl(string, chars=None): 1487 return string.strip(str(chars)) 1488 return strip_impl 1489 1490 unicode_strip_types_check(chars) 1491 1492 def strip_impl(string, chars=None): 1493 lb = unicode_strip_left_bound(string, chars) 1494 rb = unicode_strip_right_bound(string, chars) 1495 return string[lb:rb] 1496 return strip_impl 1497 1498 1499# ------------------------------------------------------------------------------ 1500# Slice functions 1501# ------------------------------------------------------------------------------ 1502 1503@register_jitable 1504def normalize_str_idx(idx, length, is_start=True): 1505 """ 1506 Parameters 1507 ---------- 1508 idx : int or None 1509 the index 1510 length : int 1511 the string length 1512 is_start : bool; optional with defaults to True 1513 Is it the *start* or the *stop* of the slice? 1514 1515 Returns 1516 ------- 1517 norm_idx : int 1518 normalized index 1519 """ 1520 if idx is None: 1521 if is_start: 1522 return 0 1523 else: 1524 return length 1525 elif idx < 0: 1526 idx += length 1527 1528 if idx < 0 or idx >= length: 1529 raise IndexError("string index out of range") 1530 1531 return idx 1532 1533 1534@register_jitable 1535def _normalize_slice_idx_count(arg, slice_len, default): 1536 """ 1537 Used for unicode_count 1538 1539 If arg < -slice_len, returns 0 (prevents circle) 1540 1541 If arg is within slice, e.g -slice_len <= arg < slice_len 1542 returns its real index via arg % slice_len 1543 1544 If arg > slice_len, returns arg (in this case count must 1545 return 0 if it is start index) 1546 """ 1547 1548 if arg is None: 1549 return default 1550 if -slice_len <= arg < slice_len: 1551 return arg % slice_len 1552 return 0 if arg < 0 else arg 1553 1554 1555@intrinsic 1556def _normalize_slice(typingctx, sliceobj, length): 1557 """Fix slice object. 1558 """ 1559 sig = sliceobj(sliceobj, length) 1560 1561 def codegen(context, builder, sig, args): 1562 [slicetype, lengthtype] = sig.args 1563 [sliceobj, length] = args 1564 slice = context.make_helper(builder, slicetype, sliceobj) 1565 slicing.guard_invalid_slice(context, builder, slicetype, slice) 1566 slicing.fix_slice(builder, slice, length) 1567 return slice._getvalue() 1568 1569 return sig, codegen 1570 1571 1572@intrinsic 1573def _slice_span(typingctx, sliceobj): 1574 """Compute the span from the given slice object. 1575 """ 1576 sig = types.intp(sliceobj) 1577 1578 def codegen(context, builder, sig, args): 1579 [slicetype] = sig.args 1580 [sliceobj] = args 1581 slice = context.make_helper(builder, slicetype, sliceobj) 1582 result_size = slicing.get_slice_length(builder, slice) 1583 return result_size 1584 1585 return sig, codegen 1586 1587 1588@register_jitable(_nrt=False) 1589def _strncpy(dst, dst_offset, src, src_offset, n): 1590 if src._kind == dst._kind: 1591 byte_width = _kind_to_byte_width(src._kind) 1592 src_byte_offset = byte_width * src_offset 1593 dst_byte_offset = byte_width * dst_offset 1594 nbytes = n * byte_width 1595 memcpy_region(dst._data, dst_byte_offset, src._data, 1596 src_byte_offset, nbytes, align=1) 1597 else: 1598 for i in range(n): 1599 _set_code_point(dst, dst_offset + i, 1600 _get_code_point(src, src_offset + i)) 1601 1602 1603@intrinsic 1604def _get_str_slice_view(typingctx, src_t, start_t, length_t): 1605 """Create a slice of a unicode string using a view of its data to avoid 1606 extra allocation. 1607 """ 1608 assert src_t == types.unicode_type 1609 1610 def codegen(context, builder, sig, args): 1611 src, start, length = args 1612 in_str = cgutils.create_struct_proxy( 1613 types.unicode_type)(context, builder, value=src) 1614 view_str = cgutils.create_struct_proxy( 1615 types.unicode_type)(context, builder) 1616 view_str.meminfo = in_str.meminfo 1617 view_str.kind = in_str.kind 1618 view_str.is_ascii = in_str.is_ascii 1619 view_str.length = length 1620 # hash value -1 to indicate "need to compute hash" 1621 view_str.hash = context.get_constant(_Py_hash_t, -1) 1622 # get a pointer to start of slice data 1623 bw_typ = context.typing_context.resolve_value_type(_kind_to_byte_width) 1624 bw_sig = bw_typ.get_call_type( 1625 context.typing_context, (types.int32,), {}) 1626 bw_impl = context.get_function(bw_typ, bw_sig) 1627 byte_width = bw_impl(builder, (in_str.kind,)) 1628 offset = builder.mul(start, byte_width) 1629 view_str.data = builder.gep(in_str.data, [offset]) 1630 # Set parent pyobject to NULL 1631 view_str.parent = cgutils.get_null_value(view_str.parent.type) 1632 # incref original string 1633 if context.enable_nrt: 1634 context.nrt.incref(builder, sig.args[0], src) 1635 return view_str._getvalue() 1636 1637 sig = types.unicode_type(types.unicode_type, types.intp, types.intp) 1638 return sig, codegen 1639 1640 1641@overload(operator.getitem) 1642def unicode_getitem(s, idx): 1643 if isinstance(s, types.UnicodeType): 1644 if isinstance(idx, types.Integer): 1645 def getitem_char(s, idx): 1646 idx = normalize_str_idx(idx, len(s)) 1647 ret = _empty_string(s._kind, 1, s._is_ascii) 1648 _set_code_point(ret, 0, _get_code_point(s, idx)) 1649 return ret 1650 return getitem_char 1651 elif isinstance(idx, types.SliceType): 1652 def getitem_slice(s, idx): 1653 slice_idx = _normalize_slice(idx, len(s)) 1654 span = _slice_span(slice_idx) 1655 1656 if slice_idx.step == 1: 1657 return _get_str_slice_view(s, slice_idx.start, span) 1658 else: 1659 ret = _empty_string(s._kind, span, s._is_ascii) 1660 cur = slice_idx.start 1661 for i in range(span): 1662 _set_code_point(ret, i, _get_code_point(s, cur)) 1663 cur += slice_idx.step 1664 return ret 1665 return getitem_slice 1666 1667 1668# ------------------------------------------------------------------------------ 1669# String operations 1670# ------------------------------------------------------------------------------ 1671 1672 1673@overload(operator.add) 1674@overload(operator.iadd) 1675def unicode_concat(a, b): 1676 if isinstance(a, types.UnicodeType) and isinstance(b, types.UnicodeType): 1677 def concat_impl(a, b): 1678 new_length = a._length + b._length 1679 new_kind = _pick_kind(a._kind, b._kind) 1680 new_ascii = _pick_ascii(a._is_ascii, b._is_ascii) 1681 result = _empty_string(new_kind, new_length, new_ascii) 1682 for i in range(len(a)): 1683 _set_code_point(result, i, _get_code_point(a, i)) 1684 for j in range(len(b)): 1685 _set_code_point(result, len(a) + j, _get_code_point(b, j)) 1686 return result 1687 return concat_impl 1688 1689 if isinstance(a, types.UnicodeType) and isinstance(b, types.UnicodeCharSeq): 1690 def concat_impl(a, b): 1691 return a + str(b) 1692 return concat_impl 1693 1694 1695@register_jitable 1696def _repeat_impl(str_arg, mult_arg): 1697 if str_arg == '' or mult_arg < 1: 1698 return '' 1699 elif mult_arg == 1: 1700 return str_arg 1701 else: 1702 new_length = str_arg._length * mult_arg 1703 new_kind = str_arg._kind 1704 result = _empty_string(new_kind, new_length, str_arg._is_ascii) 1705 # make initial copy into result 1706 len_a = len(str_arg) 1707 _strncpy(result, 0, str_arg, 0, len_a) 1708 # loop through powers of 2 for efficient copying 1709 copy_size = len_a 1710 while 2 * copy_size <= new_length: 1711 _strncpy(result, copy_size, result, 0, copy_size) 1712 copy_size *= 2 1713 1714 if not 2 * copy_size == new_length: 1715 # if copy_size not an exact multiple it then needs 1716 # to complete the rest of the copies 1717 rest = new_length - copy_size 1718 _strncpy(result, copy_size, result, copy_size - rest, rest) 1719 return result 1720 1721 1722@overload(operator.mul) 1723def unicode_repeat(a, b): 1724 if isinstance(a, types.UnicodeType) and isinstance(b, types.Integer): 1725 def wrap(a, b): 1726 return _repeat_impl(a, b) 1727 return wrap 1728 elif isinstance(a, types.Integer) and isinstance(b, types.UnicodeType): 1729 def wrap(a, b): 1730 return _repeat_impl(b, a) 1731 return wrap 1732 1733 1734@overload(operator.not_) 1735def unicode_not(a): 1736 if isinstance(a, types.UnicodeType): 1737 def impl(a): 1738 return len(a) == 0 1739 return impl 1740 1741 1742@overload_method(types.UnicodeType, 'replace') 1743def unicode_replace(s, old_str, new_str, count=-1): 1744 thety = count 1745 if isinstance(count, types.Omitted): 1746 thety = count.value 1747 elif isinstance(count, types.Optional): 1748 thety = count.type 1749 1750 if not isinstance(thety, (int, types.Integer)): 1751 raise TypingError('Unsupported parameters. The parametrs ' 1752 'must be Integer. Given count: {}'.format(count)) 1753 1754 if not isinstance(old_str, (types.UnicodeType, types.NoneType)): 1755 raise TypingError('The object must be a UnicodeType.' 1756 ' Given: {}'.format(old_str)) 1757 1758 if not isinstance(new_str, types.UnicodeType): 1759 raise TypingError('The object must be a UnicodeType.' 1760 ' Given: {}'.format(new_str)) 1761 1762 def impl(s, old_str, new_str, count=-1): 1763 if count == 0: 1764 return s 1765 if old_str == '': 1766 schars = list(s) 1767 if count == -1: 1768 return new_str + new_str.join(schars) + new_str 1769 split_result = [new_str] 1770 min_count = min(len(schars), count) 1771 for i in range(min_count): 1772 split_result.append(schars[i]) 1773 if i + 1 != min_count: 1774 split_result.append(new_str) 1775 else: 1776 split_result.append(''.join(schars[(i + 1):])) 1777 if count > len(schars): 1778 split_result.append(new_str) 1779 return ''.join(split_result) 1780 schars = s.split(old_str, count) 1781 result = new_str.join(schars) 1782 return result 1783 1784 return impl 1785 1786# ------------------------------------------------------------------------------ 1787# String `is*()` methods 1788# ------------------------------------------------------------------------------ 1789 1790 1791# generates isalpha/isalnum 1792def gen_isAlX(ascii_func, unicode_func): 1793 def unicode_isAlX(data): 1794 1795 def impl(data): 1796 length = len(data) 1797 if length == 0: 1798 return False 1799 1800 if length == 1: 1801 code_point = _get_code_point(data, 0) 1802 if data._is_ascii: 1803 return ascii_func(code_point) 1804 else: 1805 return unicode_func(code_point) 1806 1807 if data._is_ascii: 1808 for i in range(length): 1809 code_point = _get_code_point(data, i) 1810 if not ascii_func(code_point): 1811 return False 1812 1813 for i in range(length): 1814 code_point = _get_code_point(data, i) 1815 if not unicode_func(code_point): 1816 return False 1817 1818 return True 1819 1820 return impl 1821 return unicode_isAlX 1822 1823 1824# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11928-L11964 # noqa: E501 1825overload_method(types.UnicodeType, 'isalpha')(gen_isAlX(_Py_ISALPHA, 1826 _PyUnicode_IsAlpha)) 1827 1828_unicode_is_alnum = register_jitable(lambda x: 1829 (_PyUnicode_IsNumeric(x) or 1830 _PyUnicode_IsAlpha(x))) 1831 1832# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006 # noqa: E501 1833overload_method(types.UnicodeType, 'isalnum')(gen_isAlX(_Py_ISALNUM, 1834 _unicode_is_alnum)) 1835 1836 1837def _is_upper(is_lower, is_upper, is_title): 1838 # impl is an approximate translation of: 1839 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11794-L11827 # noqa: E501 1840 # mixed with: 1841 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L218-L242 # noqa: E501 1842 def impl(a): 1843 l = len(a) 1844 if l == 1: 1845 return is_upper(_get_code_point(a, 0)) 1846 if l == 0: 1847 return False 1848 cased = False 1849 for idx in range(l): 1850 code_point = _get_code_point(a, idx) 1851 if is_lower(code_point) or is_title(code_point): 1852 return False 1853 elif(not cased and is_upper(code_point)): 1854 cased = True 1855 return cased 1856 return impl 1857 1858 1859_always_false = register_jitable(lambda x: False) 1860_ascii_is_upper = register_jitable(_is_upper(_Py_ISLOWER, _Py_ISUPPER, 1861 _always_false)) 1862_unicode_is_upper = register_jitable(_is_upper(_PyUnicode_IsLowercase, 1863 _PyUnicode_IsUppercase, 1864 _PyUnicode_IsTitlecase)) 1865 1866 1867@overload_method(types.UnicodeType, 'isupper') 1868def unicode_isupper(a): 1869 """ 1870 Implements .isupper() 1871 """ 1872 def impl(a): 1873 if a._is_ascii: 1874 return _ascii_is_upper(a) 1875 else: 1876 return _unicode_is_upper(a) 1877 return impl 1878 1879 1880if utils.PYVERSION >= (3, 7): 1881 @overload_method(types.UnicodeType, 'isascii') 1882 def unicode_isascii(data): 1883 """Implements UnicodeType.isascii()""" 1884 1885 def impl(data): 1886 return data._is_ascii 1887 return impl 1888 1889 1890@overload_method(types.UnicodeType, 'istitle') 1891def unicode_istitle(data): 1892 """ 1893 Implements UnicodeType.istitle() 1894 The algorithm is an approximate translation from CPython: 1895 https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 1896 """ 1897 1898 def impl(data): 1899 length = len(data) 1900 if length == 1: 1901 char = _get_code_point(data, 0) 1902 return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) 1903 1904 if length == 0: 1905 return False 1906 1907 cased = False 1908 previous_is_cased = False 1909 for idx in range(length): 1910 char = _get_code_point(data, idx) 1911 if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): 1912 if previous_is_cased: 1913 return False 1914 previous_is_cased = True 1915 cased = True 1916 elif _PyUnicode_IsLowercase(char): 1917 if not previous_is_cased: 1918 return False 1919 previous_is_cased = True 1920 cased = True 1921 else: 1922 previous_is_cased = False 1923 1924 return cased 1925 return impl 1926 1927 1928@overload_method(types.UnicodeType, 'islower') 1929def unicode_islower(data): 1930 """ 1931 impl is an approximate translation of: 1932 https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 1933 mixed with: 1934 https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 1935 """ 1936 1937 def impl(data): 1938 length = len(data) 1939 if length == 1: 1940 return _PyUnicode_IsLowercase(_get_code_point(data, 0)) 1941 if length == 0: 1942 return False 1943 1944 cased = False 1945 for idx in range(length): 1946 cp = _get_code_point(data, idx) 1947 if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): 1948 return False 1949 elif not cased and _PyUnicode_IsLowercase(cp): 1950 cased = True 1951 return cased 1952 return impl 1953 1954 1955# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12126-L12161 # noqa: E501 1956@overload_method(types.UnicodeType, 'isidentifier') 1957def unicode_isidentifier(data): 1958 """Implements UnicodeType.isidentifier()""" 1959 1960 def impl(data): 1961 length = len(data) 1962 if length == 0: 1963 return False 1964 1965 first_cp = _get_code_point(data, 0) 1966 if not _PyUnicode_IsXidStart(first_cp) and first_cp != 0x5F: 1967 return False 1968 1969 for i in range(1, length): 1970 code_point = _get_code_point(data, i) 1971 if not _PyUnicode_IsXidContinue(code_point): 1972 return False 1973 1974 return True 1975 1976 return impl 1977 1978 1979# generator for simple unicode "isX" methods 1980def gen_isX(_PyUnicode_IS_func, empty_is_false=True): 1981 def unicode_isX(data): 1982 def impl(data): 1983 length = len(data) 1984 if length == 1: 1985 return _PyUnicode_IS_func(_get_code_point(data, 0)) 1986 1987 if empty_is_false and length == 0: 1988 return False 1989 1990 for i in range(length): 1991 code_point = _get_code_point(data, i) 1992 if not _PyUnicode_IS_func(code_point): 1993 return False 1994 1995 return True 1996 1997 return impl 1998 return unicode_isX 1999 2000 2001# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11896-L11925 # noqa: E501 2002overload_method(types.UnicodeType, 'isspace')(gen_isX(_PyUnicode_IsSpace)) 2003 2004# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12096-L12124 # noqa: E501 2005overload_method(types.UnicodeType, 'isnumeric')(gen_isX(_PyUnicode_IsNumeric)) 2006 2007# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12056-L12085 # noqa: E501 2008overload_method(types.UnicodeType, 'isdigit')(gen_isX(_PyUnicode_IsDigit)) 2009 2010# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 2011overload_method(types.UnicodeType, 'isdecimal')( 2012 gen_isX(_PyUnicode_IsDecimalDigit)) 2013 2014# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213 # noqa: E501 2015overload_method(types.UnicodeType, 'isprintable')( 2016 gen_isX(_PyUnicode_IsPrintable, False)) 2017 2018# ------------------------------------------------------------------------------ 2019# String methods that apply a transformation to the characters themselves 2020# ------------------------------------------------------------------------------ 2021 2022 2023# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 2024def case_operation(ascii_func, unicode_func): 2025 """Generate common case operation performer.""" 2026 def impl(data): 2027 length = len(data) 2028 if length == 0: 2029 return _empty_string(data._kind, length, data._is_ascii) 2030 2031 if data._is_ascii: 2032 res = _empty_string(data._kind, length, 1) 2033 ascii_func(data, res) 2034 return res 2035 2036 # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 2037 tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) 2038 # maxchar should be inside of a list to be pass as argument by reference 2039 maxchars = [0] 2040 newlength = unicode_func(data, length, tmp, maxchars) 2041 maxchar = maxchars[0] 2042 newkind = _codepoint_to_kind(maxchar) 2043 res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) 2044 for i in range(newlength): 2045 _set_code_point(res, i, _get_code_point(tmp, i)) 2046 2047 return res 2048 2049 return impl 2050 2051 2052# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9856-L9883 # noqa: E501 2053@register_jitable 2054def _handle_capital_sigma(data, length, idx): 2055 """This is a translation of the function that handles the capital sigma.""" 2056 c = 0 2057 j = idx - 1 2058 while j >= 0: 2059 c = _get_code_point(data, j) 2060 if not _PyUnicode_IsCaseIgnorable(c): 2061 break 2062 j -= 1 2063 final_sigma = (j >= 0 and _PyUnicode_IsCased(c)) 2064 if final_sigma: 2065 j = idx + 1 2066 while j < length: 2067 c = _get_code_point(data, j) 2068 if not _PyUnicode_IsCaseIgnorable(c): 2069 break 2070 j += 1 2071 final_sigma = (j == length or (not _PyUnicode_IsCased(c))) 2072 2073 return 0x3c2 if final_sigma else 0x3c3 2074 2075 2076# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9885-L9895 # noqa: E501 2077@register_jitable 2078def _lower_ucs4(code_point, data, length, idx, mapped): 2079 """This is a translation of the function that lowers a character.""" 2080 if code_point == 0x3A3: 2081 mapped[0] = _handle_capital_sigma(data, length, idx) 2082 return 1 2083 return _PyUnicode_ToLowerFull(code_point, mapped) 2084 2085 2086# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 2087def _gen_unicode_upper_or_lower(lower): 2088 def _do_upper_or_lower(data, length, res, maxchars): 2089 k = 0 2090 for idx in range(length): 2091 mapped = np.zeros(3, dtype=_Py_UCS4) 2092 code_point = _get_code_point(data, idx) 2093 if lower: 2094 n_res = _lower_ucs4(code_point, data, length, idx, mapped) 2095 else: 2096 # might be needed if call _do_upper_or_lower in unicode_upper 2097 n_res = _PyUnicode_ToUpperFull(code_point, mapped) 2098 for m in mapped[:n_res]: 2099 maxchars[0] = max(maxchars[0], m) 2100 _set_code_point(res, k, m) 2101 k += 1 2102 return k 2103 return _do_upper_or_lower 2104 2105 2106_unicode_upper = register_jitable(_gen_unicode_upper_or_lower(False)) 2107_unicode_lower = register_jitable(_gen_unicode_upper_or_lower(True)) 2108 2109 2110def _gen_ascii_upper_or_lower(func): 2111 def _ascii_upper_or_lower(data, res): 2112 for idx in range(len(data)): 2113 code_point = _get_code_point(data, idx) 2114 _set_code_point(res, idx, func(code_point)) 2115 return _ascii_upper_or_lower 2116 2117 2118_ascii_upper = register_jitable(_gen_ascii_upper_or_lower(_Py_TOUPPER)) 2119_ascii_lower = register_jitable(_gen_ascii_upper_or_lower(_Py_TOLOWER)) 2120 2121 2122@overload_method(types.UnicodeType, 'lower') 2123def unicode_lower(data): 2124 """Implements .lower()""" 2125 return case_operation(_ascii_lower, _unicode_lower) 2126 2127 2128@overload_method(types.UnicodeType, 'upper') 2129def unicode_upper(data): 2130 """Implements .upper()""" 2131 return case_operation(_ascii_upper, _unicode_upper) 2132 2133 2134# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 2135@register_jitable 2136def _unicode_casefold(data, length, res, maxchars): 2137 k = 0 2138 mapped = np.zeros(3, dtype=_Py_UCS4) 2139 for idx in range(length): 2140 mapped.fill(0) 2141 code_point = _get_code_point(data, idx) 2142 n_res = _PyUnicode_ToFoldedFull(code_point, mapped) 2143 for m in mapped[:n_res]: 2144 maxchar = maxchars[0] 2145 maxchars[0] = max(maxchar, m) 2146 _set_code_point(res, k, m) 2147 k += 1 2148 2149 return k 2150 2151 2152@register_jitable 2153def _ascii_casefold(data, res): 2154 for idx in range(len(data)): 2155 code_point = _get_code_point(data, idx) 2156 _set_code_point(res, idx, _Py_TOLOWER(code_point)) 2157 2158 2159@overload_method(types.UnicodeType, 'casefold') 2160def unicode_casefold(data): 2161 """Implements str.casefold()""" 2162 return case_operation(_ascii_casefold, _unicode_casefold) 2163 2164 2165# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 2166@register_jitable 2167def _unicode_capitalize(data, length, res, maxchars): 2168 k = 0 2169 maxchar = 0 2170 mapped = np.zeros(3, dtype=_Py_UCS4) 2171 code_point = _get_code_point(data, 0) 2172 2173 # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 2174 if _py38_or_later: 2175 n_res = _PyUnicode_ToTitleFull(code_point, mapped) 2176 else: 2177 n_res = _PyUnicode_ToUpperFull(code_point, mapped) 2178 2179 for m in mapped[:n_res]: 2180 maxchar = max(maxchar, m) 2181 _set_code_point(res, k, m) 2182 k += 1 2183 for idx in range(1, length): 2184 mapped.fill(0) 2185 code_point = _get_code_point(data, idx) 2186 n_res = _lower_ucs4(code_point, data, length, idx, mapped) 2187 for m in mapped[:n_res]: 2188 maxchar = max(maxchar, m) 2189 _set_code_point(res, k, m) 2190 k += 1 2191 maxchars[0] = maxchar 2192 return k 2193 2194 2195# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 2196@register_jitable 2197def _ascii_capitalize(data, res): 2198 code_point = _get_code_point(data, 0) 2199 _set_code_point(res, 0, _Py_TOUPPER(code_point)) 2200 for idx in range(1, len(data)): 2201 code_point = _get_code_point(data, idx) 2202 _set_code_point(res, idx, _Py_TOLOWER(code_point)) 2203 2204 2205# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 2206@overload_method(types.UnicodeType, 'capitalize') 2207def unicode_capitalize(data): 2208 return case_operation(_ascii_capitalize, _unicode_capitalize) 2209 2210 2211# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9996-L10021 # noqa: E501 2212@register_jitable 2213def _unicode_title(data, length, res, maxchars): 2214 """This is a translation of the function that titles a unicode string.""" 2215 k = 0 2216 previous_cased = False 2217 mapped = np.empty(3, dtype=_Py_UCS4) 2218 for idx in range(length): 2219 mapped.fill(0) 2220 code_point = _get_code_point(data, idx) 2221 if previous_cased: 2222 n_res = _lower_ucs4(code_point, data, length, idx, mapped) 2223 else: 2224 n_res = _PyUnicode_ToTitleFull(_Py_UCS4(code_point), mapped) 2225 for m in mapped[:n_res]: 2226 maxchar, = maxchars 2227 maxchars[0] = max(maxchar, m) 2228 _set_code_point(res, k, m) 2229 k += 1 2230 previous_cased = _PyUnicode_IsCased(_Py_UCS4(code_point)) 2231 return k 2232 2233 2234# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L332-L352 # noqa: E501 2235@register_jitable 2236def _ascii_title(data, res): 2237 """ Does .title() on an ASCII string """ 2238 previous_is_cased = False 2239 for idx in range(len(data)): 2240 code_point = _get_code_point(data, idx) 2241 if _Py_ISLOWER(code_point): 2242 if not previous_is_cased: 2243 code_point = _Py_TOUPPER(code_point) 2244 previous_is_cased = True 2245 elif _Py_ISUPPER(code_point): 2246 if previous_is_cased: 2247 code_point = _Py_TOLOWER(code_point) 2248 previous_is_cased = True 2249 else: 2250 previous_is_cased = False 2251 _set_code_point(res, idx, code_point) 2252 2253 2254# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 2255@overload_method(types.UnicodeType, 'title') 2256def unicode_title(data): 2257 """Implements str.title()""" 2258 # https://docs.python.org/3/library/stdtypes.html#str.title 2259 return case_operation(_ascii_title, _unicode_title) 2260 2261 2262# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L391-L408 # noqa: E501 2263@register_jitable 2264def _ascii_swapcase(data, res): 2265 for idx in range(len(data)): 2266 code_point = _get_code_point(data, idx) 2267 if _Py_ISUPPER(code_point): 2268 code_point = _Py_TOLOWER(code_point) 2269 elif _Py_ISLOWER(code_point): 2270 code_point = _Py_TOUPPER(code_point) 2271 _set_code_point(res, idx, code_point) 2272 2273 2274# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 2275@register_jitable 2276def _unicode_swapcase(data, length, res, maxchars): 2277 k = 0 2278 maxchar = 0 2279 mapped = np.empty(3, dtype=_Py_UCS4) 2280 for idx in range(length): 2281 mapped.fill(0) 2282 code_point = _get_code_point(data, idx) 2283 if _PyUnicode_IsUppercase(code_point): 2284 n_res = _lower_ucs4(code_point, data, length, idx, mapped) 2285 elif _PyUnicode_IsLowercase(code_point): 2286 n_res = _PyUnicode_ToUpperFull(code_point, mapped) 2287 else: 2288 n_res = 1 2289 mapped[0] = code_point 2290 for m in mapped[:n_res]: 2291 maxchar = max(maxchar, m) 2292 _set_code_point(res, k, m) 2293 k += 1 2294 maxchars[0] = maxchar 2295 return k 2296 2297 2298@overload_method(types.UnicodeType, 'swapcase') 2299def unicode_swapcase(data): 2300 return case_operation(_ascii_swapcase, _unicode_swapcase) 2301 2302 2303# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/bltinmodule.c#L1781-L1824 # noqa: E501 2304@overload(ord) 2305def ol_ord(c): 2306 if isinstance(c, types.UnicodeType): 2307 def impl(c): 2308 lc = len(c) 2309 if lc != 1: 2310 # CPython does TypeError 2311 raise TypeError("ord() expected a character") 2312 return _get_code_point(c, 0) 2313 return impl 2314 2315 2316# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L2005-L2028 # noqa: E501 2317# This looks a bit different to the cpython implementation but, with the 2318# exception of a latin1 fast path is logically the same. It finds the "kind" of 2319# the codepoint `ch`, creates a length 1 string of that kind and then injects 2320# the code point into the zero position of that string. Cpython does similar but 2321# branches for each kind (this is encapsulated in Numba's _set_code_point). 2322@register_jitable 2323def _unicode_char(ch): 2324 assert ch <= _MAX_UNICODE 2325 kind = _codepoint_to_kind(ch) 2326 ret = _empty_string(kind, 1, kind == PY_UNICODE_1BYTE_KIND) 2327 _set_code_point(ret, 0, ch) 2328 return ret 2329 2330 2331_out_of_range_msg = "chr() arg not in range(0x%hx)" % _MAX_UNICODE 2332 2333 2334# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L3045-L3055 # noqa: E501 2335@register_jitable 2336def _PyUnicode_FromOrdinal(ordinal): 2337 if (ordinal < 0 or ordinal > _MAX_UNICODE): 2338 raise ValueError(_out_of_range_msg) 2339 2340 return _unicode_char(_Py_UCS4(ordinal)) 2341 2342 2343# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Python/bltinmodule.c#L715-L720 # noqa: E501 2344@overload(chr) 2345def ol_chr(i): 2346 if isinstance(i, types.Integer): 2347 def impl(i): 2348 return _PyUnicode_FromOrdinal(i) 2349 return impl 2350 2351 2352@overload(str) 2353def integer_str(n): 2354 if isinstance(n, types.Integer): 2355 ten = n(10) 2356 2357 def impl(n): 2358 flag = False 2359 if n < 0: 2360 n = -n 2361 flag = True 2362 if n == 0: 2363 return '0' 2364 l = [] 2365 while n > 0: 2366 c = chr(ord('0') + (n % ten)) 2367 n = n // ten 2368 l.append(c) 2369 if flag: 2370 l.append('-') 2371 return ''.join(l[::-1]) 2372 return impl 2373 2374# ------------------------------------------------------------------------------ 2375# iteration 2376# ------------------------------------------------------------------------------ 2377 2378 2379@lower_builtin('getiter', types.UnicodeType) 2380def getiter_unicode(context, builder, sig, args): 2381 [ty] = sig.args 2382 [data] = args 2383 2384 iterobj = context.make_helper(builder, sig.return_type) 2385 2386 # set the index to zero 2387 zero = context.get_constant(types.uintp, 0) 2388 indexptr = cgutils.alloca_once_value(builder, zero) 2389 2390 iterobj.index = indexptr 2391 2392 # wire in the unicode type data 2393 iterobj.data = data 2394 2395 # incref as needed 2396 if context.enable_nrt: 2397 context.nrt.incref(builder, ty, data) 2398 2399 res = iterobj._getvalue() 2400 return impl_ret_new_ref(context, builder, sig.return_type, res) 2401 2402 2403@lower_builtin('iternext', types.UnicodeIteratorType) 2404# a new ref counted object is put into result._yield so set the new_ref to True! 2405@iternext_impl(RefType.NEW) 2406def iternext_unicode(context, builder, sig, args, result): 2407 [iterty] = sig.args 2408 [iter] = args 2409 2410 tyctx = context.typing_context 2411 2412 # get ref to unicode.__getitem__ 2413 fnty = tyctx.resolve_value_type(operator.getitem) 2414 getitem_sig = fnty.get_call_type(tyctx, (types.unicode_type, types.uintp), 2415 {}) 2416 getitem_impl = context.get_function(fnty, getitem_sig) 2417 2418 # get ref to unicode.__len__ 2419 fnty = tyctx.resolve_value_type(len) 2420 len_sig = fnty.get_call_type(tyctx, (types.unicode_type,), {}) 2421 len_impl = context.get_function(fnty, len_sig) 2422 2423 # grab unicode iterator struct 2424 iterobj = context.make_helper(builder, iterty, value=iter) 2425 2426 # find the length of the string 2427 strlen = len_impl(builder, (iterobj.data,)) 2428 2429 # find the current index 2430 index = builder.load(iterobj.index) 2431 2432 # see if the index is in range 2433 is_valid = builder.icmp_unsigned('<', index, strlen) 2434 result.set_valid(is_valid) 2435 2436 with builder.if_then(is_valid): 2437 # return value at index 2438 gotitem = getitem_impl(builder, (iterobj.data, index,)) 2439 result.yield_(gotitem) 2440 2441 # bump index for next cycle 2442 nindex = cgutils.increment_index(builder, index) 2443 builder.store(nindex, iterobj.index) 2444