1from collections import abc 2from decimal import Decimal 3import warnings 4 5import cython 6from cython import Py_ssize_t 7 8from cpython.datetime cimport ( 9 PyDate_Check, 10 PyDateTime_Check, 11 PyDateTime_IMPORT, 12 PyDelta_Check, 13 PyTime_Check, 14) 15from cpython.iterator cimport PyIter_Check 16from cpython.number cimport PyNumber_Check 17from cpython.object cimport Py_EQ, PyObject_RichCompareBool 18from cpython.ref cimport Py_INCREF 19from cpython.sequence cimport PySequence_Check 20from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM 21 22PyDateTime_IMPORT 23 24import numpy as np 25 26cimport numpy as cnp 27from numpy cimport ( 28 NPY_OBJECT, 29 PyArray_Check, 30 PyArray_GETITEM, 31 PyArray_ITER_DATA, 32 PyArray_ITER_NEXT, 33 PyArray_IterNew, 34 complex128_t, 35 flatiter, 36 float32_t, 37 float64_t, 38 int64_t, 39 intp_t, 40 ndarray, 41 uint8_t, 42 uint64_t, 43) 44 45cnp.import_array() 46 47cdef extern from "numpy/arrayobject.h": 48 # cython's numpy.dtype specification is incorrect, which leads to 49 # errors in issubclass(self.dtype.type, np.bool_), so we directly 50 # include the correct version 51 # https://github.com/cython/cython/issues/2022 52 53 ctypedef class numpy.dtype [object PyArray_Descr]: 54 # Use PyDataType_* macros when possible, however there are no macros 55 # for accessing some of the fields, so some are defined. Please 56 # ask on cython-dev if you need more. 57 cdef: 58 int type_num 59 int itemsize "elsize" 60 char byteorder 61 object fields 62 tuple names 63 64 65cdef extern from "src/parse_helper.h": 66 int floatify(object, float64_t *result, int *maybe_int) except -1 67 68from pandas._libs cimport util 69from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan 70 71from pandas._libs.tslib import array_to_datetime 72 73from pandas._libs.missing cimport ( 74 C_NA, 75 checknull, 76 is_null_datetime64, 77 is_null_timedelta64, 78 isnaobj, 79) 80from pandas._libs.tslibs.conversion cimport convert_to_tsobject 81from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT, checknull_with_nat 82from pandas._libs.tslibs.offsets cimport is_offset_object 83from pandas._libs.tslibs.period cimport is_period_object 84from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 85from pandas._libs.tslibs.timezones cimport tz_compare 86 87# constants that will be compared to potentially arbitrarily large 88# python int 89cdef: 90 object oINT64_MAX = <int64_t>INT64_MAX 91 object oINT64_MIN = <int64_t>INT64_MIN 92 object oUINT64_MAX = <uint64_t>UINT64_MAX 93 94 float64_t NaN = <float64_t>np.NaN 95 96 97@cython.wraparound(False) 98@cython.boundscheck(False) 99def memory_usage_of_objects(arr: object[:]) -> int64_t: 100 """ 101 Return the memory usage of an object array in bytes. 102 103 Does not include the actual bytes of the pointers 104 """ 105 i: Py_ssize_t 106 n: Py_ssize_t 107 size: int64_t 108 109 size = 0 110 n = len(arr) 111 for i in range(n): 112 size += arr[i].__sizeof__() 113 return size 114 115 116# ---------------------------------------------------------------------- 117 118 119def is_scalar(val: object) -> bool: 120 """ 121 Return True if given object is scalar. 122 123 Parameters 124 ---------- 125 val : object 126 This includes: 127 128 - numpy array scalar (e.g. np.int64) 129 - Python builtin numerics 130 - Python builtin byte arrays and strings 131 - None 132 - datetime.datetime 133 - datetime.timedelta 134 - Period 135 - decimal.Decimal 136 - Interval 137 - DateOffset 138 - Fraction 139 - Number. 140 141 Returns 142 ------- 143 bool 144 Return True if given object is scalar. 145 146 Examples 147 -------- 148 >>> dt = datetime.datetime(2018, 10, 3) 149 >>> pd.api.types.is_scalar(dt) 150 True 151 152 >>> pd.api.types.is_scalar([2, 3]) 153 False 154 155 >>> pd.api.types.is_scalar({0: 1, 2: 3}) 156 False 157 158 >>> pd.api.types.is_scalar((0, 2)) 159 False 160 161 pandas supports PEP 3141 numbers: 162 163 >>> from fractions import Fraction 164 >>> pd.api.types.is_scalar(Fraction(3, 5)) 165 True 166 """ 167 168 # Start with C-optimized checks 169 if (cnp.PyArray_IsAnyScalar(val) 170 # PyArray_IsAnyScalar is always False for bytearrays on Py3 171 or PyDate_Check(val) 172 or PyDelta_Check(val) 173 or PyTime_Check(val) 174 # We differ from numpy, which claims that None is not scalar; 175 # see np.isscalar 176 or val is C_NA 177 or val is None): 178 return True 179 180 # Next use C-optimized checks to exclude common non-scalars before falling 181 # back to non-optimized checks. 182 if PySequence_Check(val): 183 # e.g. list, tuple 184 # includes np.ndarray, Series which PyNumber_Check can return True for 185 return False 186 187 # Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number 188 return (PyNumber_Check(val) 189 or is_period_object(val) 190 or is_interval(val) 191 or is_offset_object(val)) 192 193 194def is_iterator(obj: object) -> bool: 195 """ 196 Check if the object is an iterator. 197 198 This is intended for generators, not list-like objects. 199 200 Parameters 201 ---------- 202 obj : The object to check 203 204 Returns 205 ------- 206 is_iter : bool 207 Whether `obj` is an iterator. 208 209 Examples 210 -------- 211 >>> is_iterator((x for x in [])) 212 True 213 >>> is_iterator([1, 2, 3]) 214 False 215 >>> is_iterator(datetime(2017, 1, 1)) 216 False 217 >>> is_iterator("foo") 218 False 219 >>> is_iterator(1) 220 False 221 """ 222 return PyIter_Check(obj) 223 224 225def item_from_zerodim(val: object) -> object: 226 """ 227 If the value is a zerodim array, return the item it contains. 228 229 Parameters 230 ---------- 231 val : object 232 233 Returns 234 ------- 235 object 236 237 Examples 238 -------- 239 >>> item_from_zerodim(1) 240 1 241 >>> item_from_zerodim('foobar') 242 'foobar' 243 >>> item_from_zerodim(np.array(1)) 244 1 245 >>> item_from_zerodim(np.array([1])) 246 array([1]) 247 """ 248 if cnp.PyArray_IsZeroDim(val): 249 return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val) 250 return val 251 252 253@cython.wraparound(False) 254@cython.boundscheck(False) 255def fast_unique_multiple(list arrays, sort: bool = True): 256 """ 257 Generate a list of unique values from a list of arrays. 258 259 Parameters 260 ---------- 261 list : array-like 262 List of array-like objects. 263 sort : bool 264 Whether or not to sort the resulting unique list. 265 266 Returns 267 ------- 268 list of unique values 269 """ 270 cdef: 271 ndarray[object] buf 272 Py_ssize_t k = len(arrays) 273 Py_ssize_t i, j, n 274 list uniques = [] 275 dict table = {} 276 object val, stub = 0 277 278 for i in range(k): 279 buf = arrays[i] 280 n = len(buf) 281 for j in range(n): 282 val = buf[j] 283 if val not in table: 284 table[val] = stub 285 uniques.append(val) 286 287 if sort is None: 288 try: 289 uniques.sort() 290 except TypeError: 291 warnings.warn( 292 "The values in the array are unorderable. " 293 "Pass `sort=False` to suppress this warning.", 294 RuntimeWarning, 295 ) 296 pass 297 298 return uniques 299 300 301@cython.wraparound(False) 302@cython.boundscheck(False) 303def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: 304 cdef: 305 list buf 306 Py_ssize_t k = len(lists) 307 Py_ssize_t i, j, n 308 list uniques = [] 309 dict table = {} 310 object val, stub = 0 311 312 for i in range(k): 313 buf = lists[i] 314 n = len(buf) 315 for j in range(n): 316 val = buf[j] 317 if val not in table: 318 table[val] = stub 319 uniques.append(val) 320 if sort: 321 try: 322 uniques.sort() 323 except TypeError: 324 pass 325 326 return uniques 327 328 329@cython.wraparound(False) 330@cython.boundscheck(False) 331def fast_unique_multiple_list_gen(object gen, bint sort=True): 332 """ 333 Generate a list of unique values from a generator of lists. 334 335 Parameters 336 ---------- 337 gen : generator object 338 Generator of lists from which the unique list is created. 339 sort : bool 340 Whether or not to sort the resulting unique list. 341 342 Returns 343 ------- 344 list of unique values 345 """ 346 cdef: 347 list buf 348 Py_ssize_t j, n 349 list uniques = [] 350 dict table = {} 351 object val, stub = 0 352 353 for buf in gen: 354 n = len(buf) 355 for j in range(n): 356 val = buf[j] 357 if val not in table: 358 table[val] = stub 359 uniques.append(val) 360 if sort: 361 try: 362 uniques.sort() 363 except TypeError: 364 pass 365 366 return uniques 367 368 369@cython.wraparound(False) 370@cython.boundscheck(False) 371def dicts_to_array(dicts: list, columns: list): 372 cdef: 373 Py_ssize_t i, j, k, n 374 ndarray[object, ndim=2] result 375 dict row 376 object col, onan = np.nan 377 378 k = len(columns) 379 n = len(dicts) 380 381 result = np.empty((n, k), dtype='O') 382 383 for i in range(n): 384 row = dicts[i] 385 for j in range(k): 386 col = columns[j] 387 if col in row: 388 result[i, j] = row[col] 389 else: 390 result[i, j] = onan 391 392 return result 393 394 395def fast_zip(list ndarrays): 396 """ 397 For zipping multiple ndarrays into an ndarray of tuples. 398 """ 399 cdef: 400 Py_ssize_t i, j, k, n 401 ndarray[object] result 402 flatiter it 403 object val, tup 404 405 k = len(ndarrays) 406 n = len(ndarrays[0]) 407 408 result = np.empty(n, dtype=object) 409 410 # initialize tuples on first pass 411 arr = ndarrays[0] 412 it = <flatiter>PyArray_IterNew(arr) 413 for i in range(n): 414 val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) 415 tup = PyTuple_New(k) 416 417 PyTuple_SET_ITEM(tup, 0, val) 418 Py_INCREF(val) 419 result[i] = tup 420 PyArray_ITER_NEXT(it) 421 422 for j in range(1, k): 423 arr = ndarrays[j] 424 it = <flatiter>PyArray_IterNew(arr) 425 if len(arr) != n: 426 raise ValueError("all arrays must be same length") 427 428 for i in range(n): 429 val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) 430 PyTuple_SET_ITEM(result[i], j, val) 431 Py_INCREF(val) 432 PyArray_ITER_NEXT(it) 433 434 return result 435 436 437def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length): 438 """ 439 Reverse indexing operation. 440 441 Given `indexer`, make `indexer_inv` of it, such that:: 442 443 indexer_inv[indexer[x]] = x 444 445 .. note:: If indexer is not unique, only first occurrence is accounted. 446 """ 447 cdef: 448 Py_ssize_t i, n = len(indexer) 449 ndarray[int64_t] rev_indexer 450 int64_t idx 451 452 rev_indexer = np.empty(length, dtype=np.int64) 453 rev_indexer[:] = -1 454 for i in range(n): 455 idx = indexer[i] 456 if idx != -1: 457 rev_indexer[idx] = i 458 459 return rev_indexer 460 461 462@cython.wraparound(False) 463@cython.boundscheck(False) 464def has_infs_f4(const float32_t[:] arr) -> bool: 465 cdef: 466 Py_ssize_t i, n = len(arr) 467 float32_t inf, neginf, val 468 469 inf = np.inf 470 neginf = -inf 471 472 for i in range(n): 473 val = arr[i] 474 if val == inf or val == neginf: 475 return True 476 return False 477 478 479@cython.wraparound(False) 480@cython.boundscheck(False) 481def has_infs_f8(const float64_t[:] arr) -> bool: 482 cdef: 483 Py_ssize_t i, n = len(arr) 484 float64_t inf, neginf, val 485 486 inf = np.inf 487 neginf = -inf 488 489 for i in range(n): 490 val = arr[i] 491 if val == inf or val == neginf: 492 return True 493 return False 494 495 496def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len): 497 cdef: 498 Py_ssize_t i, n = len(indices) 499 int k, vstart, vlast, v 500 501 if n == 0: 502 return slice(0, 0) 503 504 vstart = indices[0] 505 if vstart < 0 or max_len <= vstart: 506 return indices 507 508 if n == 1: 509 return slice(vstart, vstart + 1) 510 511 vlast = indices[n - 1] 512 if vlast < 0 or max_len <= vlast: 513 return indices 514 515 k = indices[1] - indices[0] 516 if k == 0: 517 return indices 518 else: 519 for i in range(2, n): 520 v = indices[i] 521 if v - indices[i - 1] != k: 522 return indices 523 524 if k > 0: 525 return slice(vstart, vlast + 1, k) 526 else: 527 if vlast == 0: 528 return slice(vstart, None, k) 529 else: 530 return slice(vstart, vlast - 1, k) 531 532 533@cython.wraparound(False) 534@cython.boundscheck(False) 535def maybe_booleans_to_slice(ndarray[uint8_t] mask): 536 cdef: 537 Py_ssize_t i, n = len(mask) 538 Py_ssize_t start = 0, end = 0 539 bint started = False, finished = False 540 541 for i in range(n): 542 if mask[i]: 543 if finished: 544 return mask.view(np.bool_) 545 if not started: 546 started = True 547 start = i 548 else: 549 if finished: 550 continue 551 552 if started: 553 end = i 554 finished = True 555 556 if not started: 557 return slice(0, 0) 558 if not finished: 559 return slice(start, None) 560 else: 561 return slice(start, end) 562 563 564@cython.wraparound(False) 565@cython.boundscheck(False) 566def array_equivalent_object(left: object[:], right: object[:]) -> bool: 567 """ 568 Perform an element by element comparison on 1-d object arrays 569 taking into account nan positions. 570 """ 571 cdef: 572 Py_ssize_t i, n = left.shape[0] 573 object x, y 574 575 for i in range(n): 576 x = left[i] 577 y = right[i] 578 579 # we are either not equal or both nan 580 # I think None == None will be true here 581 try: 582 if PyArray_Check(x) and PyArray_Check(y): 583 if not array_equivalent_object(x, y): 584 return False 585 elif (x is C_NA) ^ (y is C_NA): 586 return False 587 elif not (PyObject_RichCompareBool(x, y, Py_EQ) or 588 (x is None or is_nan(x)) and (y is None or is_nan(y))): 589 return False 590 except ValueError: 591 # Avoid raising ValueError when comparing Numpy arrays to other types 592 if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y): 593 # Only compare scalars to scalars and non-scalars to non-scalars 594 return False 595 elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y)) 596 and not (isinstance(x, type(y)) or isinstance(y, type(x)))): 597 # Check if non-scalars have the same type 598 return False 599 raise 600 return True 601 602 603@cython.wraparound(False) 604@cython.boundscheck(False) 605def astype_intsafe(ndarray[object] arr, new_dtype): 606 cdef: 607 Py_ssize_t i, n = len(arr) 608 object val 609 bint is_datelike 610 ndarray result 611 612 is_datelike = new_dtype == 'm8[ns]' 613 result = np.empty(n, dtype=new_dtype) 614 for i in range(n): 615 val = arr[i] 616 if is_datelike and checknull(val): 617 result[i] = NPY_NAT 618 else: 619 result[i] = val 620 621 return result 622 623 624@cython.wraparound(False) 625@cython.boundscheck(False) 626cpdef ndarray[object] ensure_string_array( 627 arr, 628 object na_value=np.nan, 629 bint convert_na_value=True, 630 bint copy=True, 631 bint skipna=True, 632): 633 """Returns a new numpy array with object dtype and only strings and na values. 634 635 Parameters 636 ---------- 637 arr : array-like 638 The values to be converted to str, if needed. 639 na_value : Any, default np.nan 640 The value to use for na. For example, np.nan or pd.NA. 641 convert_na_value : bool, default True 642 If False, existing na values will be used unchanged in the new array. 643 copy : bool, default True 644 Whether to ensure that a new array is returned. 645 skipna : bool, default True 646 Whether or not to coerce nulls to their stringified form 647 (e.g. if False, NaN becomes 'nan'). 648 649 Returns 650 ------- 651 ndarray 652 An array with the input array's elements casted to str or nan-like. 653 """ 654 cdef: 655 Py_ssize_t i = 0, n = len(arr) 656 657 if hasattr(arr, "to_numpy"): 658 arr = arr.to_numpy() 659 elif not isinstance(arr, np.ndarray): 660 arr = np.array(arr, dtype="object") 661 662 result = np.asarray(arr, dtype="object") 663 664 if copy and result is arr: 665 result = result.copy() 666 667 for i in range(n): 668 val = arr[i] 669 670 if isinstance(val, str): 671 continue 672 673 if not checknull(val): 674 result[i] = str(val) 675 else: 676 if convert_na_value: 677 val = na_value 678 if skipna: 679 result[i] = val 680 else: 681 result[i] = str(val) 682 683 return result 684 685 686@cython.wraparound(False) 687@cython.boundscheck(False) 688def clean_index_list(obj: list): 689 """ 690 Utility used in ``pandas.core.indexes.api.ensure_index``. 691 """ 692 cdef: 693 Py_ssize_t i, n = len(obj) 694 object val 695 bint all_arrays = True 696 697 for i in range(n): 698 val = obj[i] 699 if not (isinstance(val, list) or 700 util.is_array(val) or hasattr(val, '_data')): 701 all_arrays = False 702 break 703 704 if all_arrays: 705 return obj, all_arrays 706 707 # don't force numpy coerce with nan's 708 inferred = infer_dtype(obj, skipna=False) 709 if inferred in ['string', 'bytes', 'mixed', 'mixed-integer']: 710 return np.asarray(obj, dtype=object), 0 711 elif inferred in ['integer']: 712 # TODO: we infer an integer but it *could* be a uint64 713 try: 714 return np.asarray(obj, dtype='int64'), 0 715 except OverflowError: 716 return np.asarray(obj, dtype='object'), 0 717 718 return np.asarray(obj), 0 719 720 721# ------------------------------------------------------------------------------ 722# Groupby-related functions 723 724# TODO: could do even better if we know something about the data. eg, index has 725# 1-min data, binner has 5-min data, then bins are just strides in index. This 726# is a general, O(max(len(values), len(binner))) method. 727@cython.boundscheck(False) 728@cython.wraparound(False) 729def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, 730 object closed='left', bint hasnans=False): 731 """ 732 Int64 (datetime64) version of generic python version in ``groupby.py``. 733 """ 734 cdef: 735 Py_ssize_t lenidx, lenbin, i, j, bc, vc 736 ndarray[int64_t] bins 737 int64_t l_bin, r_bin, nat_count 738 bint right_closed = closed == 'right' 739 740 nat_count = 0 741 if hasnans: 742 mask = values == NPY_NAT 743 nat_count = np.sum(mask) 744 values = values[~mask] 745 746 lenidx = len(values) 747 lenbin = len(binner) 748 749 if lenidx <= 0 or lenbin <= 0: 750 raise ValueError("Invalid length for values or for binner") 751 752 # check binner fits data 753 if values[0] < binner[0]: 754 raise ValueError("Values falls before first bin") 755 756 if values[lenidx - 1] > binner[lenbin - 1]: 757 raise ValueError("Values falls after last bin") 758 759 bins = np.empty(lenbin - 1, dtype=np.int64) 760 761 j = 0 # index into values 762 bc = 0 # bin count 763 764 # linear scan 765 if right_closed: 766 for i in range(0, lenbin - 1): 767 r_bin = binner[i + 1] 768 # count values in current bin, advance to next bin 769 while j < lenidx and values[j] <= r_bin: 770 j += 1 771 bins[bc] = j 772 bc += 1 773 else: 774 for i in range(0, lenbin - 1): 775 r_bin = binner[i + 1] 776 # count values in current bin, advance to next bin 777 while j < lenidx and values[j] < r_bin: 778 j += 1 779 bins[bc] = j 780 bc += 1 781 782 if nat_count > 0: 783 # shift bins by the number of NaT 784 bins = bins + nat_count 785 bins = np.insert(bins, 0, nat_count) 786 787 return bins 788 789 790@cython.boundscheck(False) 791@cython.wraparound(False) 792def get_level_sorter(const int64_t[:] label, const int64_t[:] starts): 793 """ 794 Argsort for a single level of a multi-index, keeping the order of higher 795 levels unchanged. `starts` points to starts of same-key indices w.r.t 796 to leading levels; equivalent to: 797 np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort') 798 + starts[i] for i in range(len(starts) - 1)]) 799 """ 800 cdef: 801 int64_t l, r 802 Py_ssize_t i 803 ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64) 804 ndarray[int64_t, ndim=1] label_arr = np.asarray(label) 805 806 for i in range(len(starts) - 1): 807 l, r = starts[i], starts[i + 1] 808 out[l:r] = l + label_arr[l:r].argsort(kind='mergesort') 809 810 return out 811 812 813@cython.boundscheck(False) 814@cython.wraparound(False) 815def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, 816 const int64_t[:] labels, 817 Py_ssize_t max_bin, 818 int axis): 819 cdef: 820 Py_ssize_t i, j, k, n 821 ndarray[int64_t, ndim=2] counts 822 823 assert (axis == 0 or axis == 1) 824 n, k = (<object>mask).shape 825 826 if axis == 0: 827 counts = np.zeros((max_bin, k), dtype='i8') 828 with nogil: 829 for i in range(n): 830 for j in range(k): 831 if mask[i, j]: 832 counts[labels[i], j] += 1 833 834 else: # axis == 1 835 counts = np.zeros((n, max_bin), dtype='i8') 836 with nogil: 837 for i in range(n): 838 for j in range(k): 839 if mask[i, j]: 840 counts[i, labels[j]] += 1 841 842 return counts 843 844 845def generate_slices(const int64_t[:] labels, Py_ssize_t ngroups): 846 cdef: 847 Py_ssize_t i, group_size, n, start 848 int64_t lab 849 object slobj 850 ndarray[int64_t] starts, ends 851 852 n = len(labels) 853 854 starts = np.zeros(ngroups, dtype=np.int64) 855 ends = np.zeros(ngroups, dtype=np.int64) 856 857 start = 0 858 group_size = 0 859 for i in range(n): 860 lab = labels[i] 861 if lab < 0: 862 start += 1 863 else: 864 group_size += 1 865 if i == n - 1 or lab != labels[i + 1]: 866 starts[lab] = start 867 ends[lab] = start + group_size 868 start += group_size 869 group_size = 0 870 871 return starts, ends 872 873 874def indices_fast(ndarray index, const int64_t[:] labels, list keys, 875 list sorted_labels): 876 """ 877 Parameters 878 ---------- 879 index : ndarray 880 labels : ndarray[int64] 881 keys : list 882 sorted_labels : list[ndarray[int64]] 883 """ 884 cdef: 885 Py_ssize_t i, j, k, lab, cur, start, n = len(labels) 886 dict result = {} 887 object tup 888 889 k = len(keys) 890 891 if n == 0: 892 return result 893 894 start = 0 895 cur = labels[0] 896 for i in range(1, n): 897 lab = labels[i] 898 899 if lab != cur: 900 if lab != -1: 901 if k == 1: 902 # When k = 1 we do not want to return a tuple as key 903 tup = keys[0][sorted_labels[0][i - 1]] 904 else: 905 tup = PyTuple_New(k) 906 for j in range(k): 907 val = keys[j][sorted_labels[j][i - 1]] 908 PyTuple_SET_ITEM(tup, j, val) 909 Py_INCREF(val) 910 result[tup] = index[start:i] 911 start = i 912 cur = lab 913 914 if k == 1: 915 # When k = 1 we do not want to return a tuple as key 916 tup = keys[0][sorted_labels[0][n - 1]] 917 else: 918 tup = PyTuple_New(k) 919 for j in range(k): 920 val = keys[j][sorted_labels[j][n - 1]] 921 PyTuple_SET_ITEM(tup, j, val) 922 Py_INCREF(val) 923 result[tup] = index[start:] 924 925 return result 926 927 928# core.common import for fast inference checks 929 930def is_float(obj: object) -> bool: 931 """ 932 Return True if given object is float. 933 934 Returns 935 ------- 936 bool 937 """ 938 return util.is_float_object(obj) 939 940 941def is_integer(obj: object) -> bool: 942 """ 943 Return True if given object is integer. 944 945 Returns 946 ------- 947 bool 948 """ 949 return util.is_integer_object(obj) 950 951 952def is_bool(obj: object) -> bool: 953 """ 954 Return True if given object is boolean. 955 956 Returns 957 ------- 958 bool 959 """ 960 return util.is_bool_object(obj) 961 962 963def is_complex(obj: object) -> bool: 964 """ 965 Return True if given object is complex. 966 967 Returns 968 ------- 969 bool 970 """ 971 return util.is_complex_object(obj) 972 973 974cpdef bint is_decimal(object obj): 975 return isinstance(obj, Decimal) 976 977 978cpdef bint is_interval(object obj): 979 return getattr(obj, '_typ', '_typ') == 'interval' 980 981 982def is_period(val: object) -> bool: 983 """ 984 Return True if given object is Period. 985 986 Returns 987 ------- 988 bool 989 """ 990 return is_period_object(val) 991 992 993def is_list_like(obj: object, allow_sets: bool = True) -> bool: 994 """ 995 Check if the object is list-like. 996 997 Objects that are considered list-like are for example Python 998 lists, tuples, sets, NumPy arrays, and Pandas Series. 999 1000 Strings and datetime objects, however, are not considered list-like. 1001 1002 Parameters 1003 ---------- 1004 obj : object 1005 Object to check. 1006 allow_sets : bool, default True 1007 If this parameter is False, sets will not be considered list-like. 1008 1009 .. versionadded:: 0.24.0 1010 1011 Returns 1012 ------- 1013 bool 1014 Whether `obj` has list-like properties. 1015 1016 Examples 1017 -------- 1018 >>> is_list_like([1, 2, 3]) 1019 True 1020 >>> is_list_like({1, 2, 3}) 1021 True 1022 >>> is_list_like(datetime(2017, 1, 1)) 1023 False 1024 >>> is_list_like("foo") 1025 False 1026 >>> is_list_like(1) 1027 False 1028 >>> is_list_like(np.array([2])) 1029 True 1030 >>> is_list_like(np.array(2)) 1031 False 1032 """ 1033 return c_is_list_like(obj, allow_sets) 1034 1035 1036cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1: 1037 return ( 1038 isinstance(obj, abc.Iterable) 1039 # we do not count strings/unicode/bytes as list-like 1040 and not isinstance(obj, (str, bytes)) 1041 # exclude zero-dimensional numpy arrays, effectively scalars 1042 and not (util.is_array(obj) and obj.ndim == 0) 1043 # exclude sets if allow_sets is False 1044 and not (allow_sets is False and isinstance(obj, abc.Set)) 1045 ) 1046 1047 1048_TYPE_MAP = { 1049 "categorical": "categorical", 1050 "category": "categorical", 1051 "int8": "integer", 1052 "int16": "integer", 1053 "int32": "integer", 1054 "int64": "integer", 1055 "i": "integer", 1056 "uint8": "integer", 1057 "uint16": "integer", 1058 "uint32": "integer", 1059 "uint64": "integer", 1060 "u": "integer", 1061 "float32": "floating", 1062 "float64": "floating", 1063 "f": "floating", 1064 "complex64": "complex", 1065 "complex128": "complex", 1066 "c": "complex", 1067 "string": "string", 1068 "S": "bytes", 1069 "U": "string", 1070 "bool": "boolean", 1071 "b": "boolean", 1072 "datetime64[ns]": "datetime64", 1073 "M": "datetime64", 1074 "timedelta64[ns]": "timedelta64", 1075 "m": "timedelta64", 1076 "interval": "interval", 1077} 1078 1079# types only exist on certain platform 1080try: 1081 np.float128 1082 _TYPE_MAP['float128'] = 'floating' 1083except AttributeError: 1084 pass 1085try: 1086 np.complex256 1087 _TYPE_MAP['complex256'] = 'complex' 1088except AttributeError: 1089 pass 1090try: 1091 np.float16 1092 _TYPE_MAP['float16'] = 'floating' 1093except AttributeError: 1094 pass 1095 1096 1097cdef class Seen: 1098 """ 1099 Class for keeping track of the types of elements 1100 encountered when trying to perform type conversions. 1101 """ 1102 1103 cdef: 1104 bint int_ # seen_int 1105 bint nat_ # seen nat 1106 bint bool_ # seen_bool 1107 bint null_ # seen_null 1108 bint nan_ # seen_np.nan 1109 bint uint_ # seen_uint (unsigned integer) 1110 bint sint_ # seen_sint (signed integer) 1111 bint float_ # seen_float 1112 bint object_ # seen_object 1113 bint complex_ # seen_complex 1114 bint datetime_ # seen_datetime 1115 bint coerce_numeric # coerce data to numeric 1116 bint timedelta_ # seen_timedelta 1117 bint datetimetz_ # seen_datetimetz 1118 1119 def __cinit__(self, bint coerce_numeric=False): 1120 """ 1121 Initialize a Seen instance. 1122 1123 Parameters 1124 ---------- 1125 coerce_numeric : bool, default False 1126 Whether or not to force conversion to a numeric data type if 1127 initial methods to convert to numeric fail. 1128 """ 1129 self.int_ = False 1130 self.nat_ = False 1131 self.bool_ = False 1132 self.null_ = False 1133 self.nan_ = False 1134 self.uint_ = False 1135 self.sint_ = False 1136 self.float_ = False 1137 self.object_ = False 1138 self.complex_ = False 1139 self.datetime_ = False 1140 self.timedelta_ = False 1141 self.datetimetz_ = False 1142 self.coerce_numeric = coerce_numeric 1143 1144 cdef inline bint check_uint64_conflict(self) except -1: 1145 """ 1146 Check whether we can safely convert a uint64 array to a numeric dtype. 1147 1148 There are two cases when conversion to numeric dtype with a uint64 1149 array is not safe (and will therefore not be performed) 1150 1151 1) A NaN element is encountered. 1152 1153 uint64 cannot be safely cast to float64 due to truncation issues 1154 at the extreme ends of the range. 1155 1156 2) A negative number is encountered. 1157 1158 There is no numerical dtype that can hold both negative numbers 1159 and numbers greater than INT64_MAX. Hence, at least one number 1160 will be improperly cast if we convert to a numeric dtype. 1161 1162 Returns 1163 ------- 1164 bool 1165 Whether or not we should return the original input array to avoid 1166 data truncation. 1167 1168 Raises 1169 ------ 1170 ValueError 1171 uint64 elements were detected, and at least one of the 1172 two conflict cases was also detected. However, we are 1173 trying to force conversion to a numeric dtype. 1174 """ 1175 return (self.uint_ and (self.null_ or self.sint_) 1176 and not self.coerce_numeric) 1177 1178 cdef inline saw_null(self): 1179 """ 1180 Set flags indicating that a null value was encountered. 1181 """ 1182 self.null_ = True 1183 self.float_ = True 1184 1185 cdef saw_int(self, object val): 1186 """ 1187 Set flags indicating that an integer value was encountered. 1188 1189 In addition to setting a flag that an integer was seen, we 1190 also set two flags depending on the type of integer seen: 1191 1192 1) sint_ : a negative (signed) number in the 1193 range of [-2**63, 0) was encountered 1194 2) uint_ : a positive number in the range of 1195 [2**63, 2**64) was encountered 1196 1197 Parameters 1198 ---------- 1199 val : Python int 1200 Value with which to set the flags. 1201 """ 1202 self.int_ = True 1203 self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) 1204 self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) 1205 1206 @property 1207 def numeric_(self): 1208 return self.complex_ or self.float_ or self.int_ 1209 1210 @property 1211 def is_bool(self): 1212 return not (self.datetime_ or self.numeric_ or self.timedelta_ 1213 or self.nat_) 1214 1215 @property 1216 def is_float_or_complex(self): 1217 return not (self.bool_ or self.datetime_ or self.timedelta_ 1218 or self.nat_) 1219 1220 1221cdef object _try_infer_map(object dtype): 1222 """ 1223 If its in our map, just return the dtype. 1224 """ 1225 cdef: 1226 object val 1227 str attr 1228 for attr in ["name", "kind", "base"]: 1229 val = getattr(dtype, attr) 1230 if val in _TYPE_MAP: 1231 return _TYPE_MAP[val] 1232 return None 1233 1234 1235def infer_dtype(value: object, skipna: bool = True) -> str: 1236 """ 1237 Efficiently infer the type of a passed val, or list-like 1238 array of values. Return a string describing the type. 1239 1240 Parameters 1241 ---------- 1242 value : scalar, list, ndarray, or pandas type 1243 skipna : bool, default True 1244 Ignore NaN values when inferring the type. 1245 1246 Returns 1247 ------- 1248 str 1249 Describing the common type of the input data. 1250 Results can include: 1251 1252 - string 1253 - bytes 1254 - floating 1255 - integer 1256 - mixed-integer 1257 - mixed-integer-float 1258 - decimal 1259 - complex 1260 - categorical 1261 - boolean 1262 - datetime64 1263 - datetime 1264 - date 1265 - timedelta64 1266 - timedelta 1267 - time 1268 - period 1269 - mixed 1270 1271 Raises 1272 ------ 1273 TypeError 1274 If ndarray-like but cannot infer the dtype 1275 1276 Notes 1277 ----- 1278 - 'mixed' is the catchall for anything that is not otherwise 1279 specialized 1280 - 'mixed-integer-float' are floats and integers 1281 - 'mixed-integer' are integers mixed with non-integers 1282 1283 Examples 1284 -------- 1285 >>> infer_dtype(['foo', 'bar']) 1286 'string' 1287 1288 >>> infer_dtype(['a', np.nan, 'b'], skipna=True) 1289 'string' 1290 1291 >>> infer_dtype(['a', np.nan, 'b'], skipna=False) 1292 'mixed' 1293 1294 >>> infer_dtype([b'foo', b'bar']) 1295 'bytes' 1296 1297 >>> infer_dtype([1, 2, 3]) 1298 'integer' 1299 1300 >>> infer_dtype([1, 2, 3.5]) 1301 'mixed-integer-float' 1302 1303 >>> infer_dtype([1.0, 2.0, 3.5]) 1304 'floating' 1305 1306 >>> infer_dtype(['a', 1]) 1307 'mixed-integer' 1308 1309 >>> infer_dtype([Decimal(1), Decimal(2.0)]) 1310 'decimal' 1311 1312 >>> infer_dtype([True, False]) 1313 'boolean' 1314 1315 >>> infer_dtype([True, False, np.nan]) 1316 'mixed' 1317 1318 >>> infer_dtype([pd.Timestamp('20130101')]) 1319 'datetime' 1320 1321 >>> infer_dtype([datetime.date(2013, 1, 1)]) 1322 'date' 1323 1324 >>> infer_dtype([np.datetime64('2013-01-01')]) 1325 'datetime64' 1326 1327 >>> infer_dtype([datetime.timedelta(0, 1, 1)]) 1328 'timedelta' 1329 1330 >>> infer_dtype(pd.Series(list('aabc')).astype('category')) 1331 'categorical' 1332 """ 1333 cdef: 1334 Py_ssize_t i, n 1335 object val 1336 ndarray values 1337 bint seen_pdnat = False 1338 bint seen_val = False 1339 1340 if util.is_array(value): 1341 values = value 1342 elif hasattr(value, "inferred_type") and skipna is False: 1343 # Index, use the cached attribute if possible, populate the cache otherwise 1344 return value.inferred_type 1345 elif hasattr(value, "dtype"): 1346 # this will handle ndarray-like 1347 # e.g. categoricals 1348 dtype = value.dtype 1349 if not isinstance(dtype, np.dtype): 1350 value = _try_infer_map(value.dtype) 1351 if value is not None: 1352 return value 1353 1354 # its ndarray-like but we can't handle 1355 raise ValueError(f"cannot infer type for {type(value)}") 1356 1357 # Unwrap Series/Index 1358 values = np.asarray(value) 1359 1360 else: 1361 if not isinstance(value, list): 1362 value = list(value) 1363 1364 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike 1365 values = construct_1d_object_array_from_listlike(value) 1366 1367 # make contiguous 1368 # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup 1369 values = values.ravel(order="K") 1370 1371 val = _try_infer_map(values.dtype) 1372 if val is not None: 1373 return val 1374 1375 if values.dtype != np.object_: 1376 values = values.astype("O") 1377 1378 if skipna: 1379 values = values[~isnaobj(values)] 1380 1381 n = len(values) 1382 if n == 0: 1383 return "empty" 1384 1385 # try to use a valid value 1386 for i in range(n): 1387 val = values[i] 1388 1389 # do not use is_nul_datetimelike to keep 1390 # np.datetime64('nat') and np.timedelta64('nat') 1391 if val is None or util.is_nan(val): 1392 pass 1393 elif val is NaT: 1394 seen_pdnat = True 1395 else: 1396 seen_val = True 1397 break 1398 1399 # if all values are nan/NaT 1400 if seen_val is False and seen_pdnat is True: 1401 return "datetime" 1402 # float/object nan is handled in latter logic 1403 1404 if util.is_datetime64_object(val): 1405 if is_datetime64_array(values): 1406 return "datetime64" 1407 1408 elif is_timedelta(val): 1409 if is_timedelta_or_timedelta64_array(values): 1410 return "timedelta" 1411 1412 elif util.is_integer_object(val): 1413 # ordering matters here; this check must come after the is_timedelta 1414 # check otherwise numpy timedelta64 objects would come through here 1415 1416 if is_integer_array(values): 1417 return "integer" 1418 elif is_integer_float_array(values): 1419 if is_integer_na_array(values): 1420 return "integer-na" 1421 else: 1422 return "mixed-integer-float" 1423 return "mixed-integer" 1424 1425 elif PyDateTime_Check(val): 1426 if is_datetime_array(values, skipna=skipna): 1427 return "datetime" 1428 elif is_date_array(values, skipna=skipna): 1429 return "date" 1430 1431 elif PyDate_Check(val): 1432 if is_date_array(values, skipna=skipna): 1433 return "date" 1434 1435 elif PyTime_Check(val): 1436 if is_time_array(values, skipna=skipna): 1437 return "time" 1438 1439 elif is_decimal(val): 1440 if is_decimal_array(values): 1441 return "decimal" 1442 1443 elif is_complex(val): 1444 if is_complex_array(values): 1445 return "complex" 1446 1447 elif util.is_float_object(val): 1448 if is_float_array(values): 1449 return "floating" 1450 elif is_integer_float_array(values): 1451 if is_integer_na_array(values): 1452 return "integer-na" 1453 else: 1454 return "mixed-integer-float" 1455 1456 elif util.is_bool_object(val): 1457 if is_bool_array(values, skipna=skipna): 1458 return "boolean" 1459 1460 elif isinstance(val, str): 1461 if is_string_array(values, skipna=skipna): 1462 return "string" 1463 1464 elif isinstance(val, bytes): 1465 if is_bytes_array(values, skipna=skipna): 1466 return "bytes" 1467 1468 elif is_period_object(val): 1469 if is_period_array(values): 1470 return "period" 1471 1472 elif is_interval(val): 1473 if is_interval_array(values): 1474 return "interval" 1475 1476 for i in range(n): 1477 val = values[i] 1478 if (util.is_integer_object(val) and 1479 not util.is_timedelta64_object(val) and 1480 not util.is_datetime64_object(val)): 1481 return "mixed-integer" 1482 1483 return "mixed" 1484 1485 1486def infer_datetimelike_array(arr: ndarray[object]) -> str: 1487 """ 1488 Infer if we have a datetime or timedelta array. 1489 - date: we have *only* date and maybe strings, nulls 1490 - datetime: we have *only* datetimes and maybe strings, nulls 1491 - timedelta: we have *only* timedeltas and maybe strings, nulls 1492 - nat: we do not have *any* date, datetimes or timedeltas, but do have 1493 at least a NaT 1494 - mixed: other objects (strings, a mix of tz-aware and tz-naive, or 1495 actual objects) 1496 1497 Parameters 1498 ---------- 1499 arr : ndarray[object] 1500 1501 Returns 1502 ------- 1503 str: {datetime, timedelta, date, nat, mixed} 1504 """ 1505 cdef: 1506 Py_ssize_t i, n = len(arr) 1507 bint seen_timedelta = False, seen_date = False, seen_datetime = False 1508 bint seen_tz_aware = False, seen_tz_naive = False 1509 bint seen_nat = False 1510 list objs = [] 1511 object v 1512 1513 for i in range(n): 1514 v = arr[i] 1515 if isinstance(v, str): 1516 objs.append(v) 1517 1518 if len(objs) == 3: 1519 break 1520 1521 elif v is None or util.is_nan(v): 1522 # nan or None 1523 pass 1524 elif v is NaT: 1525 seen_nat = True 1526 elif PyDateTime_Check(v): 1527 # datetime 1528 seen_datetime = True 1529 1530 # disambiguate between tz-naive and tz-aware 1531 if v.tzinfo is None: 1532 seen_tz_naive = True 1533 else: 1534 seen_tz_aware = True 1535 1536 if seen_tz_naive and seen_tz_aware: 1537 return 'mixed' 1538 elif util.is_datetime64_object(v): 1539 # np.datetime64 1540 seen_datetime = True 1541 elif PyDate_Check(v): 1542 seen_date = True 1543 elif is_timedelta(v): 1544 # timedelta, or timedelta64 1545 seen_timedelta = True 1546 else: 1547 return "mixed" 1548 1549 if seen_date and not (seen_datetime or seen_timedelta): 1550 return "date" 1551 elif seen_datetime and not seen_timedelta: 1552 return "datetime" 1553 elif seen_timedelta and not seen_datetime: 1554 return "timedelta" 1555 elif seen_nat: 1556 return "nat" 1557 1558 # short-circuit by trying to 1559 # actually convert these strings 1560 # this is for performance as we don't need to try 1561 # convert *every* string array 1562 if len(objs): 1563 try: 1564 array_to_datetime(objs, errors="raise") 1565 return "datetime" 1566 except (ValueError, TypeError): 1567 pass 1568 1569 # we are *not* going to infer from strings 1570 # for timedelta as too much ambiguity 1571 1572 return 'mixed' 1573 1574 1575cdef inline bint is_timedelta(object o): 1576 return PyDelta_Check(o) or util.is_timedelta64_object(o) 1577 1578 1579cdef class Validator: 1580 1581 cdef: 1582 Py_ssize_t n 1583 dtype dtype 1584 bint skipna 1585 1586 def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), 1587 bint skipna=False): 1588 self.n = n 1589 self.dtype = dtype 1590 self.skipna = skipna 1591 1592 cdef bint validate(self, ndarray values) except -1: 1593 if not self.n: 1594 return False 1595 1596 if self.is_array_typed(): 1597 return True 1598 elif self.dtype.type_num == NPY_OBJECT: 1599 if self.skipna: 1600 return self._validate_skipna(values) 1601 else: 1602 return self._validate(values) 1603 else: 1604 return False 1605 1606 @cython.wraparound(False) 1607 @cython.boundscheck(False) 1608 cdef bint _validate(self, ndarray values) except -1: 1609 cdef: 1610 Py_ssize_t i 1611 Py_ssize_t n = self.n 1612 1613 for i in range(n): 1614 if not self.is_valid(values[i]): 1615 return False 1616 1617 return self.finalize_validate() 1618 1619 @cython.wraparound(False) 1620 @cython.boundscheck(False) 1621 cdef bint _validate_skipna(self, ndarray values) except -1: 1622 cdef: 1623 Py_ssize_t i 1624 Py_ssize_t n = self.n 1625 1626 for i in range(n): 1627 if not self.is_valid_skipna(values[i]): 1628 return False 1629 1630 return self.finalize_validate_skipna() 1631 1632 cdef bint is_valid(self, object value) except -1: 1633 return self.is_value_typed(value) 1634 1635 cdef bint is_valid_skipna(self, object value) except -1: 1636 return self.is_valid(value) or self.is_valid_null(value) 1637 1638 cdef bint is_value_typed(self, object value) except -1: 1639 raise NotImplementedError(f"{type(self).__name__} child class " 1640 "must define is_value_typed") 1641 1642 cdef bint is_valid_null(self, object value) except -1: 1643 return value is None or value is C_NA or util.is_nan(value) 1644 1645 cdef bint is_array_typed(self) except -1: 1646 return False 1647 1648 cdef inline bint finalize_validate(self): 1649 return True 1650 1651 cdef bint finalize_validate_skipna(self): 1652 # TODO(phillipc): Remove the existing validate methods and replace them 1653 # with the skipna versions upon full deprecation of skipna=False 1654 return True 1655 1656 1657cdef class BoolValidator(Validator): 1658 cdef inline bint is_value_typed(self, object value) except -1: 1659 return util.is_bool_object(value) 1660 1661 cdef inline bint is_array_typed(self) except -1: 1662 return issubclass(self.dtype.type, np.bool_) 1663 1664 1665cpdef bint is_bool_array(ndarray values, bint skipna=False): 1666 cdef: 1667 BoolValidator validator = BoolValidator(len(values), 1668 values.dtype, 1669 skipna=skipna) 1670 return validator.validate(values) 1671 1672 1673cdef class IntegerValidator(Validator): 1674 cdef inline bint is_value_typed(self, object value) except -1: 1675 return util.is_integer_object(value) 1676 1677 cdef inline bint is_array_typed(self) except -1: 1678 return issubclass(self.dtype.type, np.integer) 1679 1680 1681cpdef bint is_integer_array(ndarray values): 1682 cdef: 1683 IntegerValidator validator = IntegerValidator(len(values), 1684 values.dtype) 1685 return validator.validate(values) 1686 1687 1688cdef class IntegerNaValidator(Validator): 1689 cdef inline bint is_value_typed(self, object value) except -1: 1690 return (util.is_integer_object(value) 1691 or (util.is_nan(value) and util.is_float_object(value))) 1692 1693 1694cdef bint is_integer_na_array(ndarray values): 1695 cdef: 1696 IntegerNaValidator validator = IntegerNaValidator(len(values), 1697 values.dtype) 1698 return validator.validate(values) 1699 1700 1701cdef class IntegerFloatValidator(Validator): 1702 cdef inline bint is_value_typed(self, object value) except -1: 1703 return util.is_integer_object(value) or util.is_float_object(value) 1704 1705 cdef inline bint is_array_typed(self) except -1: 1706 return issubclass(self.dtype.type, np.integer) 1707 1708 1709cdef bint is_integer_float_array(ndarray values): 1710 cdef: 1711 IntegerFloatValidator validator = IntegerFloatValidator(len(values), 1712 values.dtype) 1713 return validator.validate(values) 1714 1715 1716cdef class FloatValidator(Validator): 1717 cdef inline bint is_value_typed(self, object value) except -1: 1718 return util.is_float_object(value) 1719 1720 cdef inline bint is_array_typed(self) except -1: 1721 return issubclass(self.dtype.type, np.floating) 1722 1723 1724cpdef bint is_float_array(ndarray values): 1725 cdef: 1726 FloatValidator validator = FloatValidator(len(values), values.dtype) 1727 return validator.validate(values) 1728 1729 1730cdef class ComplexValidator(Validator): 1731 cdef inline bint is_value_typed(self, object value) except -1: 1732 return ( 1733 util.is_complex_object(value) 1734 or (util.is_float_object(value) and is_nan(value)) 1735 ) 1736 1737 cdef inline bint is_array_typed(self) except -1: 1738 return issubclass(self.dtype.type, np.complexfloating) 1739 1740 1741cdef bint is_complex_array(ndarray values): 1742 cdef: 1743 ComplexValidator validator = ComplexValidator(len(values), values.dtype) 1744 return validator.validate(values) 1745 1746 1747cdef class DecimalValidator(Validator): 1748 cdef inline bint is_value_typed(self, object value) except -1: 1749 return is_decimal(value) 1750 1751 1752cdef bint is_decimal_array(ndarray values): 1753 cdef: 1754 DecimalValidator validator = DecimalValidator(len(values), values.dtype) 1755 return validator.validate(values) 1756 1757 1758cdef class StringValidator(Validator): 1759 cdef inline bint is_value_typed(self, object value) except -1: 1760 return isinstance(value, str) 1761 1762 cdef inline bint is_array_typed(self) except -1: 1763 return issubclass(self.dtype.type, np.str_) 1764 1765 cdef bint is_valid_null(self, object value) except -1: 1766 # We deliberately exclude None / NaN here since StringArray uses NA 1767 return value is C_NA 1768 1769 1770cpdef bint is_string_array(ndarray values, bint skipna=False): 1771 cdef: 1772 StringValidator validator = StringValidator(len(values), 1773 values.dtype, 1774 skipna=skipna) 1775 return validator.validate(values) 1776 1777 1778cdef class BytesValidator(Validator): 1779 cdef inline bint is_value_typed(self, object value) except -1: 1780 return isinstance(value, bytes) 1781 1782 cdef inline bint is_array_typed(self) except -1: 1783 return issubclass(self.dtype.type, np.bytes_) 1784 1785 1786cdef bint is_bytes_array(ndarray values, bint skipna=False): 1787 cdef: 1788 BytesValidator validator = BytesValidator(len(values), values.dtype, 1789 skipna=skipna) 1790 return validator.validate(values) 1791 1792 1793cdef class TemporalValidator(Validator): 1794 cdef: 1795 Py_ssize_t generic_null_count 1796 1797 def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), 1798 bint skipna=False): 1799 self.n = n 1800 self.dtype = dtype 1801 self.skipna = skipna 1802 self.generic_null_count = 0 1803 1804 cdef inline bint is_valid(self, object value) except -1: 1805 return self.is_value_typed(value) or self.is_valid_null(value) 1806 1807 cdef bint is_valid_null(self, object value) except -1: 1808 raise NotImplementedError(f"{type(self).__name__} child class " 1809 "must define is_valid_null") 1810 1811 cdef inline bint is_valid_skipna(self, object value) except -1: 1812 cdef: 1813 bint is_typed_null = self.is_valid_null(value) 1814 bint is_generic_null = value is None or util.is_nan(value) 1815 self.generic_null_count += is_typed_null and is_generic_null 1816 return self.is_value_typed(value) or is_typed_null or is_generic_null 1817 1818 cdef inline bint finalize_validate_skipna(self): 1819 return self.generic_null_count != self.n 1820 1821 1822cdef class DatetimeValidator(TemporalValidator): 1823 cdef bint is_value_typed(self, object value) except -1: 1824 return PyDateTime_Check(value) 1825 1826 cdef inline bint is_valid_null(self, object value) except -1: 1827 return is_null_datetime64(value) 1828 1829 1830cpdef bint is_datetime_array(ndarray values, bint skipna=True): 1831 cdef: 1832 DatetimeValidator validator = DatetimeValidator(len(values), 1833 skipna=skipna) 1834 return validator.validate(values) 1835 1836 1837cdef class Datetime64Validator(DatetimeValidator): 1838 cdef inline bint is_value_typed(self, object value) except -1: 1839 return util.is_datetime64_object(value) 1840 1841 1842cpdef bint is_datetime64_array(ndarray values): 1843 cdef: 1844 Datetime64Validator validator = Datetime64Validator(len(values), 1845 skipna=True) 1846 return validator.validate(values) 1847 1848 1849# TODO: only non-here use is in test 1850def is_datetime_with_singletz_array(values: ndarray) -> bool: 1851 """ 1852 Check values have the same tzinfo attribute. 1853 Doesn't check values are datetime-like types. 1854 """ 1855 cdef: 1856 Py_ssize_t i = 0, j, n = len(values) 1857 object base_val, base_tz, val, tz 1858 1859 if n == 0: 1860 return False 1861 # Get a reference timezone to compare with the rest of the tzs in the array 1862 for i in range(n): 1863 base_val = values[i] 1864 if base_val is not NaT: 1865 base_tz = getattr(base_val, 'tzinfo', None) 1866 break 1867 1868 for j in range(i, n): 1869 # Compare val's timezone with the reference timezone 1870 # NaT can coexist with tz-aware datetimes, so skip if encountered 1871 val = values[j] 1872 if val is not NaT: 1873 tz = getattr(val, 'tzinfo', None) 1874 if not tz_compare(base_tz, tz): 1875 return False 1876 1877 return True 1878 1879 1880cdef class TimedeltaValidator(TemporalValidator): 1881 cdef bint is_value_typed(self, object value) except -1: 1882 return PyDelta_Check(value) 1883 1884 cdef inline bint is_valid_null(self, object value) except -1: 1885 return is_null_timedelta64(value) 1886 1887 1888cdef class AnyTimedeltaValidator(TimedeltaValidator): 1889 cdef inline bint is_value_typed(self, object value) except -1: 1890 return is_timedelta(value) 1891 1892 1893# TODO: only non-here use is in test 1894cpdef bint is_timedelta_or_timedelta64_array(ndarray values): 1895 """ 1896 Infer with timedeltas and/or nat/none. 1897 """ 1898 cdef: 1899 AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), 1900 skipna=True) 1901 return validator.validate(values) 1902 1903 1904cdef class DateValidator(Validator): 1905 cdef inline bint is_value_typed(self, object value) except -1: 1906 return PyDate_Check(value) 1907 1908 1909cpdef bint is_date_array(ndarray values, bint skipna=False): 1910 cdef: 1911 DateValidator validator = DateValidator(len(values), skipna=skipna) 1912 return validator.validate(values) 1913 1914 1915cdef class TimeValidator(Validator): 1916 cdef inline bint is_value_typed(self, object value) except -1: 1917 return PyTime_Check(value) 1918 1919 1920cpdef bint is_time_array(ndarray values, bint skipna=False): 1921 cdef: 1922 TimeValidator validator = TimeValidator(len(values), skipna=skipna) 1923 return validator.validate(values) 1924 1925 1926cdef class PeriodValidator(TemporalValidator): 1927 cdef inline bint is_value_typed(self, object value) except -1: 1928 return is_period_object(value) 1929 1930 cdef inline bint is_valid_null(self, object value) except -1: 1931 return checknull_with_nat(value) 1932 1933 1934cpdef bint is_period_array(ndarray values): 1935 cdef: 1936 PeriodValidator validator = PeriodValidator(len(values), skipna=True) 1937 return validator.validate(values) 1938 1939 1940cdef class IntervalValidator(Validator): 1941 cdef inline bint is_value_typed(self, object value) except -1: 1942 return is_interval(value) 1943 1944 1945cpdef bint is_interval_array(ndarray values): 1946 cdef: 1947 IntervalValidator validator = IntervalValidator(len(values), 1948 skipna=True) 1949 return validator.validate(values) 1950 1951 1952@cython.boundscheck(False) 1953@cython.wraparound(False) 1954def maybe_convert_numeric(ndarray[object] values, set na_values, 1955 bint convert_empty=True, bint coerce_numeric=False): 1956 """ 1957 Convert object array to a numeric array if possible. 1958 1959 Parameters 1960 ---------- 1961 values : ndarray 1962 Array of object elements to convert. 1963 na_values : set 1964 Set of values that should be interpreted as NaN. 1965 convert_empty : bool, default True 1966 If an empty array-like object is encountered, whether to interpret 1967 that element as NaN or not. If set to False, a ValueError will be 1968 raised if such an element is encountered and 'coerce_numeric' is False. 1969 coerce_numeric : bool, default False 1970 If initial attempts to convert to numeric have failed, whether to 1971 force conversion to numeric via alternative methods or by setting the 1972 element to NaN. Otherwise, an Exception will be raised when such an 1973 element is encountered. 1974 1975 This boolean also has an impact on how conversion behaves when a 1976 numeric array has no suitable numerical dtype to return (i.e. uint64, 1977 int32, uint8). If set to False, the original object array will be 1978 returned. Otherwise, a ValueError will be raised. 1979 1980 Returns 1981 ------- 1982 Array of converted object values to numerical ones. 1983 """ 1984 if len(values) == 0: 1985 return np.array([], dtype='i8') 1986 1987 # fastpath for ints - try to convert all based on first value 1988 cdef: 1989 object val = values[0] 1990 1991 if util.is_integer_object(val): 1992 try: 1993 maybe_ints = values.astype('i8') 1994 if (maybe_ints == values).all(): 1995 return maybe_ints 1996 except (ValueError, OverflowError, TypeError): 1997 pass 1998 1999 # Otherwise, iterate and do full inference. 2000 cdef: 2001 int status, maybe_int 2002 Py_ssize_t i, n = values.size 2003 Seen seen = Seen(coerce_numeric) 2004 ndarray[float64_t] floats = np.empty(n, dtype='f8') 2005 ndarray[complex128_t] complexes = np.empty(n, dtype='c16') 2006 ndarray[int64_t] ints = np.empty(n, dtype='i8') 2007 ndarray[uint64_t] uints = np.empty(n, dtype='u8') 2008 ndarray[uint8_t] bools = np.empty(n, dtype='u1') 2009 float64_t fval 2010 2011 for i in range(n): 2012 val = values[i] 2013 2014 if val.__hash__ is not None and val in na_values: 2015 seen.saw_null() 2016 floats[i] = complexes[i] = NaN 2017 elif util.is_float_object(val): 2018 fval = val 2019 if fval != fval: 2020 seen.null_ = True 2021 2022 floats[i] = complexes[i] = fval 2023 seen.float_ = True 2024 elif util.is_integer_object(val): 2025 floats[i] = complexes[i] = val 2026 2027 val = int(val) 2028 seen.saw_int(val) 2029 2030 if val >= 0: 2031 if val <= oUINT64_MAX: 2032 uints[i] = val 2033 else: 2034 seen.float_ = True 2035 2036 if oINT64_MIN <= val <= oINT64_MAX: 2037 ints[i] = val 2038 2039 if val < oINT64_MIN or (seen.sint_ and seen.uint_): 2040 seen.float_ = True 2041 2042 elif util.is_bool_object(val): 2043 floats[i] = uints[i] = ints[i] = bools[i] = val 2044 seen.bool_ = True 2045 elif val is None or val is C_NA: 2046 seen.saw_null() 2047 floats[i] = complexes[i] = NaN 2048 elif hasattr(val, '__len__') and len(val) == 0: 2049 if convert_empty or seen.coerce_numeric: 2050 seen.saw_null() 2051 floats[i] = complexes[i] = NaN 2052 else: 2053 raise ValueError("Empty string encountered") 2054 elif util.is_complex_object(val): 2055 complexes[i] = val 2056 seen.complex_ = True 2057 elif is_decimal(val): 2058 floats[i] = complexes[i] = val 2059 seen.float_ = True 2060 else: 2061 try: 2062 status = floatify(val, &fval, &maybe_int) 2063 2064 if fval in na_values: 2065 seen.saw_null() 2066 floats[i] = complexes[i] = NaN 2067 else: 2068 if fval != fval: 2069 seen.null_ = True 2070 2071 floats[i] = fval 2072 2073 if maybe_int: 2074 as_int = int(val) 2075 2076 if as_int in na_values: 2077 seen.saw_null() 2078 else: 2079 seen.saw_int(as_int) 2080 2081 if as_int not in na_values: 2082 if as_int < oINT64_MIN or as_int > oUINT64_MAX: 2083 if seen.coerce_numeric: 2084 seen.float_ = True 2085 else: 2086 raise ValueError("Integer out of range.") 2087 else: 2088 if as_int >= 0: 2089 uints[i] = as_int 2090 2091 if as_int <= oINT64_MAX: 2092 ints[i] = as_int 2093 2094 seen.float_ = seen.float_ or (seen.uint_ and seen.sint_) 2095 else: 2096 seen.float_ = True 2097 except (TypeError, ValueError) as err: 2098 if not seen.coerce_numeric: 2099 raise type(err)(f"{err} at position {i}") 2100 2101 seen.saw_null() 2102 floats[i] = NaN 2103 2104 if seen.check_uint64_conflict(): 2105 return values 2106 2107 if seen.complex_: 2108 return complexes 2109 elif seen.float_: 2110 return floats 2111 elif seen.int_: 2112 if seen.uint_: 2113 return uints 2114 else: 2115 return ints 2116 elif seen.bool_: 2117 return bools.view(np.bool_) 2118 elif seen.uint_: 2119 return uints 2120 return ints 2121 2122 2123@cython.boundscheck(False) 2124@cython.wraparound(False) 2125def maybe_convert_objects(ndarray[object] objects, bint try_float=False, 2126 bint safe=False, bint convert_datetime=False, 2127 bint convert_timedelta=False, 2128 bint convert_to_nullable_integer=False): 2129 """ 2130 Type inference function-- convert object array to proper dtype 2131 2132 Parameters 2133 ---------- 2134 values : ndarray 2135 Array of object elements to convert. 2136 try_float : bool, default False 2137 If an array-like object contains only float or NaN values is 2138 encountered, whether to convert and return an array of float dtype. 2139 safe : bool, default False 2140 Whether to upcast numeric type (e.g. int cast to float). If set to 2141 True, no upcasting will be performed. 2142 convert_datetime : bool, default False 2143 If an array-like object contains only datetime values or NaT is 2144 encountered, whether to convert and return an array of M8[ns] dtype. 2145 convert_timedelta : bool, default False 2146 If an array-like object contains only timedelta values or NaT is 2147 encountered, whether to convert and return an array of m8[ns] dtype. 2148 convert_to_nullable_integer : bool, default False 2149 If an array-like object contains only integer values (and NaN) is 2150 encountered, whether to convert and return an IntegerArray. 2151 2152 Returns 2153 ------- 2154 Array of converted object values to more specific dtypes if applicable. 2155 """ 2156 cdef: 2157 Py_ssize_t i, n 2158 ndarray[float64_t] floats 2159 ndarray[complex128_t] complexes 2160 ndarray[int64_t] ints 2161 ndarray[uint64_t] uints 2162 ndarray[uint8_t] bools 2163 int64_t[:] idatetimes 2164 int64_t[:] itimedeltas 2165 Seen seen = Seen() 2166 object val 2167 float64_t fval, fnan 2168 2169 n = len(objects) 2170 2171 floats = np.empty(n, dtype='f8') 2172 complexes = np.empty(n, dtype='c16') 2173 ints = np.empty(n, dtype='i8') 2174 uints = np.empty(n, dtype='u8') 2175 bools = np.empty(n, dtype=np.uint8) 2176 mask = np.full(n, False) 2177 2178 if convert_datetime: 2179 datetimes = np.empty(n, dtype='M8[ns]') 2180 idatetimes = datetimes.view(np.int64) 2181 2182 if convert_timedelta: 2183 timedeltas = np.empty(n, dtype='m8[ns]') 2184 itimedeltas = timedeltas.view(np.int64) 2185 2186 fnan = np.nan 2187 2188 for i in range(n): 2189 val = objects[i] 2190 2191 if val is None: 2192 seen.null_ = True 2193 floats[i] = complexes[i] = fnan 2194 mask[i] = True 2195 elif val is NaT: 2196 seen.nat_ = True 2197 if convert_datetime: 2198 idatetimes[i] = NPY_NAT 2199 if convert_timedelta: 2200 itimedeltas[i] = NPY_NAT 2201 if not (convert_datetime or convert_timedelta): 2202 seen.object_ = True 2203 break 2204 elif val is np.nan: 2205 seen.nan_ = True 2206 mask[i] = True 2207 floats[i] = complexes[i] = val 2208 elif util.is_bool_object(val): 2209 seen.bool_ = True 2210 bools[i] = val 2211 elif util.is_float_object(val): 2212 floats[i] = complexes[i] = val 2213 seen.float_ = True 2214 elif util.is_datetime64_object(val): 2215 if convert_datetime: 2216 idatetimes[i] = convert_to_tsobject( 2217 val, None, None, 0, 0).value 2218 seen.datetime_ = True 2219 else: 2220 seen.object_ = True 2221 break 2222 elif is_timedelta(val): 2223 if convert_timedelta: 2224 itimedeltas[i] = convert_to_timedelta64(val, 'ns') 2225 seen.timedelta_ = True 2226 else: 2227 seen.object_ = True 2228 break 2229 elif util.is_integer_object(val): 2230 seen.int_ = True 2231 floats[i] = <float64_t>val 2232 complexes[i] = <double complex>val 2233 if not seen.null_: 2234 val = int(val) 2235 seen.saw_int(val) 2236 2237 if ((seen.uint_ and seen.sint_) or 2238 val > oUINT64_MAX or val < oINT64_MIN): 2239 seen.object_ = True 2240 break 2241 2242 if seen.uint_: 2243 uints[i] = val 2244 elif seen.sint_: 2245 ints[i] = val 2246 else: 2247 uints[i] = val 2248 ints[i] = val 2249 2250 elif util.is_complex_object(val): 2251 complexes[i] = val 2252 seen.complex_ = True 2253 elif PyDateTime_Check(val) or util.is_datetime64_object(val): 2254 2255 # if we have an tz's attached then return the objects 2256 if convert_datetime: 2257 if getattr(val, 'tzinfo', None) is not None: 2258 seen.datetimetz_ = True 2259 break 2260 else: 2261 seen.datetime_ = True 2262 idatetimes[i] = convert_to_tsobject( 2263 val, None, None, 0, 0).value 2264 else: 2265 seen.object_ = True 2266 break 2267 elif try_float and not isinstance(val, str): 2268 # this will convert Decimal objects 2269 try: 2270 floats[i] = float(val) 2271 complexes[i] = complex(val) 2272 seen.float_ = True 2273 except (ValueError, TypeError): 2274 seen.object_ = True 2275 break 2276 else: 2277 seen.object_ = True 2278 break 2279 2280 # we try to coerce datetime w/tz but must all have the same tz 2281 if seen.datetimetz_: 2282 if is_datetime_with_singletz_array(objects): 2283 from pandas import DatetimeIndex 2284 return DatetimeIndex(objects) 2285 seen.object_ = True 2286 2287 if not seen.object_: 2288 if not safe: 2289 if seen.null_ or seen.nan_: 2290 if seen.is_float_or_complex: 2291 if seen.complex_: 2292 return complexes 2293 elif seen.float_: 2294 return floats 2295 elif seen.int_: 2296 if convert_to_nullable_integer: 2297 from pandas.core.arrays import IntegerArray 2298 return IntegerArray(ints, mask) 2299 else: 2300 return floats 2301 elif seen.nan_: 2302 return floats 2303 else: 2304 if not seen.bool_: 2305 if seen.datetime_: 2306 if not seen.numeric_ and not seen.timedelta_: 2307 return datetimes 2308 elif seen.timedelta_: 2309 if not seen.numeric_: 2310 return timedeltas 2311 elif seen.nat_: 2312 if not seen.numeric_: 2313 if convert_datetime and convert_timedelta: 2314 # TODO: array full of NaT ambiguity resolve here needed 2315 pass 2316 elif convert_datetime: 2317 return datetimes 2318 elif convert_timedelta: 2319 return timedeltas 2320 else: 2321 if seen.complex_: 2322 return complexes 2323 elif seen.float_: 2324 return floats 2325 elif seen.int_: 2326 if seen.uint_: 2327 return uints 2328 else: 2329 return ints 2330 elif seen.is_bool: 2331 return bools.view(np.bool_) 2332 2333 else: 2334 # don't cast int to float, etc. 2335 if seen.null_: 2336 if seen.is_float_or_complex: 2337 if seen.complex_: 2338 if not seen.int_: 2339 return complexes 2340 elif seen.float_ or seen.nan_: 2341 if not seen.int_: 2342 return floats 2343 else: 2344 if not seen.bool_: 2345 if seen.datetime_: 2346 if not seen.numeric_ and not seen.timedelta_: 2347 return datetimes 2348 elif seen.timedelta_: 2349 if not seen.numeric_: 2350 return timedeltas 2351 elif seen.nat_: 2352 if not seen.numeric_: 2353 if convert_datetime and convert_timedelta: 2354 # TODO: array full of NaT ambiguity resolve here needed 2355 pass 2356 elif convert_datetime: 2357 return datetimes 2358 elif convert_timedelta: 2359 return timedeltas 2360 else: 2361 if seen.complex_: 2362 if not seen.int_: 2363 return complexes 2364 elif seen.float_ or seen.nan_: 2365 if not seen.int_: 2366 return floats 2367 elif seen.int_: 2368 if seen.uint_: 2369 return uints 2370 else: 2371 return ints 2372 elif seen.is_bool and not seen.nan_: 2373 return bools.view(np.bool_) 2374 2375 return objects 2376 2377 2378# Note: no_default is exported to the public API in pandas.api.extensions 2379no_default = object() #: Sentinel indicating the default value. 2380 2381 2382@cython.boundscheck(False) 2383@cython.wraparound(False) 2384def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, 2385 object na_value=no_default, object dtype=object): 2386 """ 2387 Substitute for np.vectorize with pandas-friendly dtype inference. 2388 2389 Parameters 2390 ---------- 2391 arr : ndarray 2392 f : function 2393 mask : ndarray 2394 uint8 dtype ndarray indicating values not to apply `f` to. 2395 convert : bool, default True 2396 Whether to call `maybe_convert_objects` on the resulting ndarray 2397 na_value : Any, optional 2398 The result value to use for masked values. By default, the 2399 input value is used 2400 dtype : numpy.dtype 2401 The numpy dtype to use for the result ndarray. 2402 2403 Returns 2404 ------- 2405 ndarray 2406 """ 2407 cdef: 2408 Py_ssize_t i, n 2409 ndarray result 2410 object val 2411 2412 n = len(arr) 2413 result = np.empty(n, dtype=dtype) 2414 for i in range(n): 2415 if mask[i]: 2416 if na_value is no_default: 2417 val = arr[i] 2418 else: 2419 val = na_value 2420 else: 2421 val = f(arr[i]) 2422 2423 if cnp.PyArray_IsZeroDim(val): 2424 # unbox 0-dim arrays, GH#690 2425 val = val.item() 2426 2427 result[i] = val 2428 2429 if convert: 2430 return maybe_convert_objects(result, 2431 try_float=False, 2432 convert_datetime=False, 2433 convert_timedelta=False) 2434 2435 return result 2436 2437 2438@cython.boundscheck(False) 2439@cython.wraparound(False) 2440def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False): 2441 """ 2442 Substitute for np.vectorize with pandas-friendly dtype inference. 2443 2444 Parameters 2445 ---------- 2446 arr : ndarray 2447 f : function 2448 convert : bint 2449 ignore_na : bint 2450 If True, NA values will not have f applied 2451 2452 Returns 2453 ------- 2454 ndarray 2455 """ 2456 cdef: 2457 Py_ssize_t i, n 2458 ndarray[object] result 2459 object val 2460 2461 n = len(arr) 2462 result = np.empty(n, dtype=object) 2463 for i in range(n): 2464 if ignore_na and checknull(arr[i]): 2465 result[i] = arr[i] 2466 continue 2467 val = f(arr[i]) 2468 2469 if cnp.PyArray_IsZeroDim(val): 2470 # unbox 0-dim arrays, GH#690 2471 val = val.item() 2472 2473 result[i] = val 2474 2475 if convert: 2476 return maybe_convert_objects(result, 2477 try_float=False, 2478 convert_datetime=False, 2479 convert_timedelta=False) 2480 2481 return result 2482 2483 2484def to_object_array(rows: object, int min_width=0): 2485 """ 2486 Convert a list of lists into an object array. 2487 2488 Parameters 2489 ---------- 2490 rows : 2-d array (N, K) 2491 List of lists to be converted into an array. 2492 min_width : int 2493 Minimum width of the object array. If a list 2494 in `rows` contains fewer than `width` elements, 2495 the remaining elements in the corresponding row 2496 will all be `NaN`. 2497 2498 Returns 2499 ------- 2500 numpy array of the object dtype. 2501 """ 2502 cdef: 2503 Py_ssize_t i, j, n, k, tmp 2504 ndarray[object, ndim=2] result 2505 list row 2506 2507 rows = list(rows) 2508 n = len(rows) 2509 2510 k = min_width 2511 for i in range(n): 2512 tmp = len(rows[i]) 2513 if tmp > k: 2514 k = tmp 2515 2516 result = np.empty((n, k), dtype=object) 2517 2518 for i in range(n): 2519 row = list(rows[i]) 2520 2521 for j in range(len(row)): 2522 result[i, j] = row[j] 2523 2524 return result 2525 2526 2527def tuples_to_object_array(ndarray[object] tuples): 2528 cdef: 2529 Py_ssize_t i, j, n, k, tmp 2530 ndarray[object, ndim=2] result 2531 tuple tup 2532 2533 n = len(tuples) 2534 k = len(tuples[0]) 2535 result = np.empty((n, k), dtype=object) 2536 for i in range(n): 2537 tup = tuples[i] 2538 for j in range(k): 2539 result[i, j] = tup[j] 2540 2541 return result 2542 2543 2544def to_object_array_tuples(rows: object): 2545 """ 2546 Convert a list of tuples into an object array. Any subclass of 2547 tuple in `rows` will be casted to tuple. 2548 2549 Parameters 2550 ---------- 2551 rows : 2-d array (N, K) 2552 List of tuples to be converted into an array. 2553 2554 Returns 2555 ------- 2556 numpy array of the object dtype. 2557 """ 2558 cdef: 2559 Py_ssize_t i, j, n, k, tmp 2560 ndarray[object, ndim=2] result 2561 tuple row 2562 2563 rows = list(rows) 2564 n = len(rows) 2565 2566 k = 0 2567 for i in range(n): 2568 tmp = 1 if checknull(rows[i]) else len(rows[i]) 2569 if tmp > k: 2570 k = tmp 2571 2572 result = np.empty((n, k), dtype=object) 2573 2574 try: 2575 for i in range(n): 2576 row = rows[i] 2577 for j in range(len(row)): 2578 result[i, j] = row[j] 2579 except TypeError: 2580 # e.g. "Expected tuple, got list" 2581 # upcast any subclasses to tuple 2582 for i in range(n): 2583 row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) 2584 for j in range(len(row)): 2585 result[i, j] = row[j] 2586 2587 return result 2588 2589 2590@cython.wraparound(False) 2591@cython.boundscheck(False) 2592def fast_multiget(dict mapping, ndarray keys, default=np.nan): 2593 cdef: 2594 Py_ssize_t i, n = len(keys) 2595 object val 2596 ndarray[object] output = np.empty(n, dtype='O') 2597 2598 if n == 0: 2599 # kludge, for Series 2600 return np.empty(0, dtype='f8') 2601 2602 for i in range(n): 2603 val = keys[i] 2604 if val in mapping: 2605 output[i] = mapping[val] 2606 else: 2607 output[i] = default 2608 2609 return maybe_convert_objects(output) 2610