1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18import os 19import warnings 20 21 22cdef _sequence_to_array(object sequence, object mask, object size, 23 DataType type, CMemoryPool* pool, c_bool from_pandas): 24 cdef: 25 int64_t c_size 26 PyConversionOptions options 27 shared_ptr[CChunkedArray] chunked 28 29 if type is not None: 30 options.type = type.sp_type 31 32 if size is not None: 33 options.size = size 34 35 options.from_pandas = from_pandas 36 options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) 37 38 with nogil: 39 chunked = GetResultValue( 40 ConvertPySequence(sequence, mask, options, pool) 41 ) 42 43 if chunked.get().num_chunks() == 1: 44 return pyarrow_wrap_array(chunked.get().chunk(0)) 45 else: 46 return pyarrow_wrap_chunked_array(chunked) 47 48 49cdef inline _is_array_like(obj): 50 if isinstance(obj, np.ndarray): 51 return True 52 return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj) 53 54 55def _ndarray_to_arrow_type(object values, DataType type): 56 return pyarrow_wrap_data_type(_ndarray_to_type(values, type)) 57 58 59cdef shared_ptr[CDataType] _ndarray_to_type(object values, 60 DataType type) except *: 61 cdef shared_ptr[CDataType] c_type 62 63 dtype = values.dtype 64 65 if type is None and dtype != object: 66 with nogil: 67 check_status(NumPyDtypeToArrow(dtype, &c_type)) 68 69 if type is not None: 70 c_type = type.sp_type 71 72 return c_type 73 74 75cdef _ndarray_to_array(object values, object mask, DataType type, 76 c_bool from_pandas, c_bool safe, CMemoryPool* pool): 77 cdef: 78 shared_ptr[CChunkedArray] chunked_out 79 shared_ptr[CDataType] c_type = _ndarray_to_type(values, type) 80 CCastOptions cast_options = CCastOptions(safe) 81 82 with nogil: 83 check_status(NdarrayToArrow(pool, values, mask, from_pandas, 84 c_type, cast_options, &chunked_out)) 85 86 if chunked_out.get().num_chunks() > 1: 87 return pyarrow_wrap_chunked_array(chunked_out) 88 else: 89 return pyarrow_wrap_array(chunked_out.get().chunk(0)) 90 91 92cdef _codes_to_indices(object codes, object mask, DataType type, 93 MemoryPool memory_pool): 94 """ 95 Convert the codes of a pandas Categorical to indices for a pyarrow 96 DictionaryArray, taking into account missing values + mask 97 """ 98 if mask is None: 99 mask = codes == -1 100 else: 101 mask = mask | (codes == -1) 102 return array(codes, mask=mask, type=type, memory_pool=memory_pool) 103 104 105def _handle_arrow_array_protocol(obj, type, mask, size): 106 if mask is not None or size is not None: 107 raise ValueError( 108 "Cannot specify a mask or a size when passing an object that is " 109 "converted with the __arrow_array__ protocol.") 110 res = obj.__arrow_array__(type=type) 111 if not isinstance(res, (Array, ChunkedArray)): 112 raise TypeError("The object's __arrow_array__ method does not " 113 "return a pyarrow Array or ChunkedArray.") 114 return res 115 116 117def array(object obj, type=None, mask=None, size=None, from_pandas=None, 118 bint safe=True, MemoryPool memory_pool=None): 119 """ 120 Create pyarrow.Array instance from a Python object. 121 122 Parameters 123 ---------- 124 obj : sequence, iterable, ndarray or Series 125 If both type and size are specified may be a single use iterable. If 126 not strongly-typed, Arrow type will be inferred for resulting array. 127 type : pyarrow.DataType 128 Explicit type to attempt to coerce to, otherwise will be inferred from 129 the data. 130 mask : array[bool], optional 131 Indicate which values are null (True) or not null (False). 132 size : int64, optional 133 Size of the elements. If the input is larger than size bail at this 134 length. For iterators, if size is larger than the input iterator this 135 will be treated as a "max size", but will involve an initial allocation 136 of size followed by a resize to the actual size (so if you know the 137 exact size specifying it correctly will give you better performance). 138 from_pandas : bool, default None 139 Use pandas's semantics for inferring nulls from values in 140 ndarray-like data. If passed, the mask tasks precedence, but 141 if a value is unmasked (not-null), but still null according to 142 pandas semantics, then it is null. Defaults to False if not 143 passed explicitly by user, or True if a pandas object is 144 passed in. 145 safe : bool, default True 146 Check for overflows or other unsafe conversions. 147 memory_pool : pyarrow.MemoryPool, optional 148 If not passed, will allocate memory from the currently-set default 149 memory pool. 150 151 Returns 152 ------- 153 array : pyarrow.Array or pyarrow.ChunkedArray 154 A ChunkedArray instead of an Array is returned if: 155 156 - the object data overflowed binary storage. 157 - the object's ``__arrow_array__`` protocol method returned a chunked 158 array. 159 160 Notes 161 ----- 162 Localized timestamps will currently be returned as UTC (pandas's native 163 representation). Timezone-naive data will be implicitly interpreted as 164 UTC. 165 166 Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by 167 default converted as MonthDayNanoIntervalArray. relativedelta leapdays 168 are ignored as are all absolute fields on both objects. datetime.timedelta 169 can also be converted to MonthDayNanoIntervalArray but this requires 170 passing MonthDayNanoIntervalType explicitly. 171 172 Converting to dictionary array will promote to a wider integer type for 173 indices if the number of distinct values cannot be represented, even if 174 the index type was explicitly set. This means that if there are more than 175 127 values the returned dictionary array's index type will be at least 176 pa.int16() even if pa.int8() was passed to the function. Note that an 177 explicit index type will not be demoted even if it is wider than required. 178 179 Examples 180 -------- 181 >>> import pandas as pd 182 >>> import pyarrow as pa 183 >>> pa.array(pd.Series([1, 2])) 184 <pyarrow.lib.Int64Array object at 0x7f674e4c0e10> 185 [ 186 1, 187 2 188 ] 189 190 >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) 191 <pyarrow.lib.DictionaryArray object at 0x7feb288d9040> 192 -- dictionary: 193 [ 194 "a", 195 "b" 196 ] 197 -- indices: 198 [ 199 0, 200 1, 201 0 202 ] 203 204 >>> import numpy as np 205 >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) 206 <pyarrow.lib.Int64Array object at 0x7f9019e11208> 207 [ 208 1, 209 null 210 ] 211 212 >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) 213 >>> arr.type.index_type 214 DataType(int16) 215 """ 216 cdef: 217 CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) 218 bint is_pandas_object = False 219 bint c_from_pandas 220 221 type = ensure_type(type, allow_none=True) 222 223 if from_pandas is None: 224 c_from_pandas = False 225 else: 226 c_from_pandas = from_pandas 227 228 if hasattr(obj, '__arrow_array__'): 229 return _handle_arrow_array_protocol(obj, type, mask, size) 230 elif _is_array_like(obj): 231 if mask is not None: 232 if _is_array_like(mask): 233 mask = get_values(mask, &is_pandas_object) 234 else: 235 raise TypeError("Mask must be a numpy array " 236 "when converting numpy arrays") 237 238 values = get_values(obj, &is_pandas_object) 239 if is_pandas_object and from_pandas is None: 240 c_from_pandas = True 241 242 if isinstance(values, np.ma.MaskedArray): 243 if mask is not None: 244 raise ValueError("Cannot pass a numpy masked array and " 245 "specify a mask at the same time") 246 else: 247 # don't use shrunken masks 248 mask = None if values.mask is np.ma.nomask else values.mask 249 values = values.data 250 251 if mask is not None: 252 if mask.dtype != np.bool_: 253 raise TypeError("Mask must be boolean dtype") 254 if mask.ndim != 1: 255 raise ValueError("Mask must be 1D array") 256 if len(values) != len(mask): 257 raise ValueError( 258 "Mask is a different length from sequence being converted") 259 260 if hasattr(values, '__arrow_array__'): 261 return _handle_arrow_array_protocol(values, type, mask, size) 262 elif pandas_api.is_categorical(values): 263 if type is not None: 264 if type.id != Type_DICTIONARY: 265 return _ndarray_to_array( 266 np.asarray(values), mask, type, c_from_pandas, safe, 267 pool) 268 index_type = type.index_type 269 value_type = type.value_type 270 if values.ordered != type.ordered: 271 warnings.warn( 272 "The 'ordered' flag of the passed categorical values " 273 "does not match the 'ordered' of the specified type. " 274 "Using the flag of the values, but in the future this " 275 "mismatch will raise a ValueError.", 276 FutureWarning, stacklevel=2) 277 else: 278 index_type = None 279 value_type = None 280 281 indices = _codes_to_indices( 282 values.codes, mask, index_type, memory_pool) 283 try: 284 dictionary = array( 285 values.categories.values, type=value_type, 286 memory_pool=memory_pool) 287 except TypeError: 288 # TODO when removing the deprecation warning, this whole 289 # try/except can be removed (to bubble the TypeError of 290 # the first array(..) call) 291 if value_type is not None: 292 warnings.warn( 293 "The dtype of the 'categories' of the passed " 294 "categorical values ({0}) does not match the " 295 "specified type ({1}). For now ignoring the specified " 296 "type, but in the future this mismatch will raise a " 297 "TypeError".format( 298 values.categories.dtype, value_type), 299 FutureWarning, stacklevel=2) 300 dictionary = array( 301 values.categories.values, memory_pool=memory_pool) 302 else: 303 raise 304 305 return DictionaryArray.from_arrays( 306 indices, dictionary, ordered=values.ordered, safe=safe) 307 else: 308 if pandas_api.have_pandas: 309 values, type = pandas_api.compat.get_datetimetz_type( 310 values, obj.dtype, type) 311 return _ndarray_to_array(values, mask, type, c_from_pandas, safe, 312 pool) 313 else: 314 # ConvertPySequence does strict conversion if type is explicitly passed 315 return _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) 316 317 318def asarray(values, type=None): 319 """ 320 Convert to pyarrow.Array, inferring type if not provided. 321 322 Parameters 323 ---------- 324 values : array-like 325 This can be a sequence, numpy.ndarray, pyarrow.Array or 326 pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be 327 a ChunkedArray, otherwise the output will be a Array. 328 type : string or DataType 329 Explicitly construct the array with this type. Attempt to cast if 330 indicated type is different. 331 332 Returns 333 ------- 334 arr : Array or ChunkedArray 335 """ 336 if isinstance(values, (Array, ChunkedArray)): 337 if type is not None and not values.type.equals(type): 338 values = values.cast(type) 339 return values 340 else: 341 return array(values, type=type) 342 343 344def nulls(size, type=None, MemoryPool memory_pool=None): 345 """ 346 Create a strongly-typed Array instance with all elements null. 347 348 Parameters 349 ---------- 350 size : int 351 Array length. 352 type : pyarrow.DataType, default None 353 Explicit type for the array. By default use NullType. 354 memory_pool : MemoryPool, default None 355 Arrow MemoryPool to use for allocations. Uses the default memory 356 pool is not passed. 357 358 Returns 359 ------- 360 arr : Array 361 362 Examples 363 -------- 364 >>> import pyarrow as pa 365 >>> pa.nulls(10) 366 <pyarrow.lib.NullArray object at 0x7ffaf04c2e50> 367 10 nulls 368 369 >>> pa.nulls(3, pa.uint32()) 370 <pyarrow.lib.UInt32Array object at 0x7ffaf04c2e50> 371 [ 372 null, 373 null, 374 null 375 ] 376 """ 377 cdef: 378 CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) 379 int64_t length = size 380 shared_ptr[CDataType] ty 381 shared_ptr[CArray] arr 382 383 type = ensure_type(type, allow_none=True) 384 if type is None: 385 type = null() 386 387 ty = pyarrow_unwrap_data_type(type) 388 with nogil: 389 arr = GetResultValue(MakeArrayOfNull(ty, length, pool)) 390 391 return pyarrow_wrap_array(arr) 392 393 394def repeat(value, size, MemoryPool memory_pool=None): 395 """ 396 Create an Array instance whose slots are the given scalar. 397 398 Parameters 399 ---------- 400 value : Scalar-like object 401 Either a pyarrow.Scalar or any python object coercible to a Scalar. 402 size : int 403 Number of times to repeat the scalar in the output Array. 404 memory_pool : MemoryPool, default None 405 Arrow MemoryPool to use for allocations. Uses the default memory 406 pool is not passed. 407 408 Returns 409 ------- 410 arr : Array 411 412 Examples 413 -------- 414 >>> import pyarrow as pa 415 >>> pa.repeat(10, 3) 416 <pyarrow.lib.Int64Array object at 0x7ffac03a2750> 417 [ 418 10, 419 10, 420 10 421 ] 422 423 >>> pa.repeat([1, 2], 2) 424 <pyarrow.lib.ListArray object at 0x7ffaf04c2e50> 425 [ 426 [ 427 1, 428 2 429 ], 430 [ 431 1, 432 2 433 ] 434 ] 435 436 >>> pa.repeat("string", 3) 437 <pyarrow.lib.StringArray object at 0x7ffac03a2750> 438 [ 439 "string", 440 "string", 441 "string" 442 ] 443 444 >>> pa.repeat(pa.scalar({'a': 1, 'b': [1, 2]}), 2) 445 <pyarrow.lib.StructArray object at 0x7ffac03a2750> 446 -- is_valid: all not null 447 -- child 0 type: int64 448 [ 449 1, 450 1 451 ] 452 -- child 1 type: list<item: int64> 453 [ 454 [ 455 1, 456 2 457 ], 458 [ 459 1, 460 2 461 ] 462 ] 463 """ 464 cdef: 465 CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) 466 int64_t length = size 467 shared_ptr[CArray] c_array 468 shared_ptr[CScalar] c_scalar 469 470 if not isinstance(value, Scalar): 471 value = scalar(value, memory_pool=memory_pool) 472 473 c_scalar = (<Scalar> value).unwrap() 474 with nogil: 475 c_array = GetResultValue( 476 MakeArrayFromScalar(deref(c_scalar), length, pool) 477 ) 478 479 return pyarrow_wrap_array(c_array) 480 481 482def infer_type(values, mask=None, from_pandas=False): 483 """ 484 Attempt to infer Arrow data type that can hold the passed Python 485 sequence type in an Array object 486 487 Parameters 488 ---------- 489 values : array-like 490 Sequence to infer type from. 491 mask : ndarray (bool type), optional 492 Optional exclusion mask where True marks null, False non-null. 493 from_pandas : bool, default False 494 Use pandas's NA/null sentinel values for type inference. 495 496 Returns 497 ------- 498 type : DataType 499 """ 500 cdef: 501 shared_ptr[CDataType] out 502 c_bool use_pandas_sentinels = from_pandas 503 504 if mask is not None and not isinstance(mask, np.ndarray): 505 mask = np.array(mask, dtype=bool) 506 507 out = GetResultValue(InferArrowType(values, mask, use_pandas_sentinels)) 508 return pyarrow_wrap_data_type(out) 509 510 511def _normalize_slice(object arrow_obj, slice key): 512 """ 513 Slices with step not equal to 1 (or None) will produce a copy 514 rather than a zero-copy view 515 """ 516 cdef: 517 Py_ssize_t start, stop, step 518 Py_ssize_t n = len(arrow_obj) 519 520 start = key.start or 0 521 if start < 0: 522 start += n 523 if start < 0: 524 start = 0 525 elif start >= n: 526 start = n 527 528 stop = key.stop if key.stop is not None else n 529 if stop < 0: 530 stop += n 531 if stop < 0: 532 stop = 0 533 elif stop >= n: 534 stop = n 535 536 step = key.step or 1 537 if step != 1: 538 if step < 0: 539 # Negative steps require some special handling 540 if key.start is None: 541 start = n - 1 542 543 if key.stop is None: 544 stop = -1 545 546 indices = np.arange(start, stop, step) 547 return arrow_obj.take(indices) 548 else: 549 length = max(stop - start, 0) 550 return arrow_obj.slice(start, length) 551 552 553cdef Py_ssize_t _normalize_index(Py_ssize_t index, 554 Py_ssize_t length) except -1: 555 if index < 0: 556 index += length 557 if index < 0: 558 raise IndexError("index out of bounds") 559 elif index >= length: 560 raise IndexError("index out of bounds") 561 return index 562 563 564cdef wrap_datum(const CDatum& datum): 565 if datum.kind() == DatumType_ARRAY: 566 return pyarrow_wrap_array(MakeArray(datum.array())) 567 elif datum.kind() == DatumType_CHUNKED_ARRAY: 568 return pyarrow_wrap_chunked_array(datum.chunked_array()) 569 elif datum.kind() == DatumType_RECORD_BATCH: 570 return pyarrow_wrap_batch(datum.record_batch()) 571 elif datum.kind() == DatumType_TABLE: 572 return pyarrow_wrap_table(datum.table()) 573 elif datum.kind() == DatumType_SCALAR: 574 return pyarrow_wrap_scalar(datum.scalar()) 575 else: 576 raise ValueError("Unable to wrap Datum in a Python object") 577 578 579cdef _append_array_buffers(const CArrayData* ad, list res): 580 """ 581 Recursively append Buffer wrappers from *ad* and its children. 582 """ 583 cdef size_t i, n 584 assert ad != NULL 585 n = ad.buffers.size() 586 for i in range(n): 587 buf = ad.buffers[i] 588 res.append(pyarrow_wrap_buffer(buf) 589 if buf.get() != NULL else None) 590 n = ad.child_data.size() 591 for i in range(n): 592 _append_array_buffers(ad.child_data[i].get(), res) 593 594 595cdef _reduce_array_data(const CArrayData* ad): 596 """ 597 Recursively dissect ArrayData to (pickable) tuples. 598 """ 599 cdef size_t i, n 600 assert ad != NULL 601 602 n = ad.buffers.size() 603 buffers = [] 604 for i in range(n): 605 buf = ad.buffers[i] 606 buffers.append(pyarrow_wrap_buffer(buf) 607 if buf.get() != NULL else None) 608 609 children = [] 610 n = ad.child_data.size() 611 for i in range(n): 612 children.append(_reduce_array_data(ad.child_data[i].get())) 613 614 if ad.dictionary.get() != NULL: 615 dictionary = _reduce_array_data(ad.dictionary.get()) 616 else: 617 dictionary = None 618 619 return pyarrow_wrap_data_type(ad.type), ad.length, ad.null_count, \ 620 ad.offset, buffers, children, dictionary 621 622 623cdef shared_ptr[CArrayData] _reconstruct_array_data(data): 624 """ 625 Reconstruct CArrayData objects from the tuple structure generated 626 by _reduce_array_data. 627 """ 628 cdef: 629 int64_t length, null_count, offset, i 630 DataType dtype 631 Buffer buf 632 vector[shared_ptr[CBuffer]] c_buffers 633 vector[shared_ptr[CArrayData]] c_children 634 shared_ptr[CArrayData] c_dictionary 635 636 dtype, length, null_count, offset, buffers, children, dictionary = data 637 638 for i in range(len(buffers)): 639 buf = buffers[i] 640 if buf is None: 641 c_buffers.push_back(shared_ptr[CBuffer]()) 642 else: 643 c_buffers.push_back(buf.buffer) 644 645 for i in range(len(children)): 646 c_children.push_back(_reconstruct_array_data(children[i])) 647 648 if dictionary is not None: 649 c_dictionary = _reconstruct_array_data(dictionary) 650 651 return CArrayData.MakeWithChildrenAndDictionary( 652 dtype.sp_type, 653 length, 654 c_buffers, 655 c_children, 656 c_dictionary, 657 null_count, 658 offset) 659 660 661def _restore_array(data): 662 """ 663 Reconstruct an Array from pickled ArrayData. 664 """ 665 cdef shared_ptr[CArrayData] ad = _reconstruct_array_data(data) 666 return pyarrow_wrap_array(MakeArray(ad)) 667 668 669cdef class _PandasConvertible(_Weakrefable): 670 671 def to_pandas( 672 self, 673 memory_pool=None, 674 categories=None, 675 bint strings_to_categorical=False, 676 bint zero_copy_only=False, 677 bint integer_object_nulls=False, 678 bint date_as_object=True, 679 bint timestamp_as_object=False, 680 bint use_threads=True, 681 bint deduplicate_objects=True, 682 bint ignore_metadata=False, 683 bint safe=True, 684 bint split_blocks=False, 685 bint self_destruct=False, 686 types_mapper=None 687 ): 688 """ 689 Convert to a pandas-compatible NumPy array or DataFrame, as appropriate 690 691 Parameters 692 ---------- 693 memory_pool : MemoryPool, default None 694 Arrow MemoryPool to use for allocations. Uses the default memory 695 pool is not passed. 696 strings_to_categorical : bool, default False 697 Encode string (UTF8) and binary types to pandas.Categorical. 698 categories: list, default empty 699 List of fields that should be returned as pandas.Categorical. Only 700 applies to table-like data structures. 701 zero_copy_only : bool, default False 702 Raise an ArrowException if this function call would require copying 703 the underlying data. 704 integer_object_nulls : bool, default False 705 Cast integers with nulls to objects 706 date_as_object : bool, default True 707 Cast dates to objects. If False, convert to datetime64[ns] dtype. 708 timestamp_as_object : bool, default False 709 Cast non-nanosecond timestamps (np.datetime64) to objects. This is 710 useful if you have timestamps that don't fit in the normal date 711 range of nanosecond timestamps (1678 CE-2262 CE). 712 If False, all timestamps are converted to datetime64[ns] dtype. 713 use_threads: bool, default True 714 Whether to parallelize the conversion using multiple threads. 715 deduplicate_objects : bool, default False 716 Do not create multiple copies Python objects when created, to save 717 on memory use. Conversion will be slower. 718 ignore_metadata : bool, default False 719 If True, do not use the 'pandas' metadata to reconstruct the 720 DataFrame index, if present 721 safe : bool, default True 722 For certain data types, a cast is needed in order to store the 723 data in a pandas DataFrame or Series (e.g. timestamps are always 724 stored as nanoseconds in pandas). This option controls whether it 725 is a safe cast or not. 726 split_blocks : bool, default False 727 If True, generate one internal "block" for each column when 728 creating a pandas.DataFrame from a RecordBatch or Table. While this 729 can temporarily reduce memory note that various pandas operations 730 can trigger "consolidation" which may balloon memory use. 731 self_destruct : bool, default False 732 EXPERIMENTAL: If True, attempt to deallocate the originating Arrow 733 memory while converting the Arrow object to pandas. If you use the 734 object after calling to_pandas with this option it will crash your 735 program. 736 737 Note that you may not see always memory usage improvements. For 738 example, if multiple columns share an underlying allocation, 739 memory can't be freed until all columns are converted. 740 types_mapper : function, default None 741 A function mapping a pyarrow DataType to a pandas ExtensionDtype. 742 This can be used to override the default pandas type for conversion 743 of built-in pyarrow types or in absence of pandas_metadata in the 744 Table schema. The function receives a pyarrow DataType and is 745 expected to return a pandas ExtensionDtype or ``None`` if the 746 default conversion should be used for that type. If you have 747 a dictionary mapping, you can pass ``dict.get`` as function. 748 749 Returns 750 ------- 751 pandas.Series or pandas.DataFrame depending on type of object 752 """ 753 options = dict( 754 pool=memory_pool, 755 strings_to_categorical=strings_to_categorical, 756 zero_copy_only=zero_copy_only, 757 integer_object_nulls=integer_object_nulls, 758 date_as_object=date_as_object, 759 timestamp_as_object=timestamp_as_object, 760 use_threads=use_threads, 761 deduplicate_objects=deduplicate_objects, 762 safe=safe, 763 split_blocks=split_blocks, 764 self_destruct=self_destruct 765 ) 766 return self._to_pandas(options, categories=categories, 767 ignore_metadata=ignore_metadata, 768 types_mapper=types_mapper) 769 770 771cdef PandasOptions _convert_pandas_options(dict options): 772 cdef PandasOptions result 773 result.pool = maybe_unbox_memory_pool(options['pool']) 774 result.strings_to_categorical = options['strings_to_categorical'] 775 result.zero_copy_only = options['zero_copy_only'] 776 result.integer_object_nulls = options['integer_object_nulls'] 777 result.date_as_object = options['date_as_object'] 778 result.timestamp_as_object = options['timestamp_as_object'] 779 result.use_threads = options['use_threads'] 780 result.deduplicate_objects = options['deduplicate_objects'] 781 result.safe_cast = options['safe'] 782 result.split_blocks = options['split_blocks'] 783 result.self_destruct = options['self_destruct'] 784 result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) 785 return result 786 787 788cdef class Array(_PandasConvertible): 789 """ 790 The base class for all Arrow arrays. 791 """ 792 793 def __init__(self): 794 raise TypeError("Do not call {}'s constructor directly, use one of " 795 "the `pyarrow.Array.from_*` functions instead." 796 .format(self.__class__.__name__)) 797 798 cdef void init(self, const shared_ptr[CArray]& sp_array) except *: 799 self.sp_array = sp_array 800 self.ap = sp_array.get() 801 self.type = pyarrow_wrap_data_type(self.sp_array.get().type()) 802 803 def _debug_print(self): 804 with nogil: 805 check_status(DebugPrint(deref(self.ap), 0)) 806 807 def diff(self, Array other): 808 """ 809 Compare contents of this array against another one. 810 811 Return string containing the result of arrow::Diff comparing contents 812 of this array against the other array. 813 """ 814 cdef c_string result 815 with nogil: 816 result = self.ap.Diff(deref(other.ap)) 817 return frombytes(result, safe=True) 818 819 def cast(self, object target_type, safe=True): 820 """ 821 Cast array values to another data type 822 823 See pyarrow.compute.cast for usage 824 """ 825 return _pc().cast(self, target_type, safe=safe) 826 827 def view(self, object target_type): 828 """ 829 Return zero-copy "view" of array as another data type. 830 831 The data types must have compatible columnar buffer layouts 832 833 Parameters 834 ---------- 835 target_type : DataType 836 Type to construct view as. 837 838 Returns 839 ------- 840 view : Array 841 """ 842 cdef DataType type = ensure_type(target_type) 843 cdef shared_ptr[CArray] result 844 with nogil: 845 result = GetResultValue(self.ap.View(type.sp_type)) 846 return pyarrow_wrap_array(result) 847 848 def sum(self, **kwargs): 849 """ 850 Sum the values in a numerical array. 851 """ 852 options = _pc().ScalarAggregateOptions(**kwargs) 853 return _pc().call_function('sum', [self], options) 854 855 def unique(self): 856 """ 857 Compute distinct elements in array. 858 """ 859 return _pc().call_function('unique', [self]) 860 861 def dictionary_encode(self, null_encoding='mask'): 862 """ 863 Compute dictionary-encoded representation of array. 864 """ 865 options = _pc().DictionaryEncodeOptions(null_encoding) 866 return _pc().call_function('dictionary_encode', [self], options) 867 868 def value_counts(self): 869 """ 870 Compute counts of unique elements in array. 871 872 Returns 873 ------- 874 An array of <input type "Values", int64_t "Counts"> structs 875 """ 876 return _pc().call_function('value_counts', [self]) 877 878 @staticmethod 879 def from_pandas(obj, mask=None, type=None, bint safe=True, 880 MemoryPool memory_pool=None): 881 """ 882 Convert pandas.Series to an Arrow Array. 883 884 This method uses Pandas semantics about what values indicate 885 nulls. See pyarrow.array for more general conversion from arrays or 886 sequences to Arrow arrays. 887 888 Parameters 889 ---------- 890 obj : ndarray, pandas.Series, array-like 891 mask : array (boolean), optional 892 Indicate which values are null (True) or not null (False). 893 type : pyarrow.DataType 894 Explicit type to attempt to coerce to, otherwise will be inferred 895 from the data. 896 safe : bool, default True 897 Check for overflows or other unsafe conversions. 898 memory_pool : pyarrow.MemoryPool, optional 899 If not passed, will allocate memory from the currently-set default 900 memory pool. 901 902 Notes 903 ----- 904 Localized timestamps will currently be returned as UTC (pandas's native 905 representation). Timezone-naive data will be implicitly interpreted as 906 UTC. 907 908 Returns 909 ------- 910 array : pyarrow.Array or pyarrow.ChunkedArray 911 ChunkedArray is returned if object data overflows binary buffer. 912 """ 913 return array(obj, mask=mask, type=type, safe=safe, from_pandas=True, 914 memory_pool=memory_pool) 915 916 def __reduce__(self): 917 return _restore_array, \ 918 (_reduce_array_data(self.sp_array.get().data().get()),) 919 920 @staticmethod 921 def from_buffers(DataType type, length, buffers, null_count=-1, offset=0, 922 children=None): 923 """ 924 Construct an Array from a sequence of buffers. 925 926 The concrete type returned depends on the datatype. 927 928 Parameters 929 ---------- 930 type : DataType 931 The value type of the array. 932 length : int 933 The number of values in the array. 934 buffers : List[Buffer] 935 The buffers backing this array. 936 null_count : int, default -1 937 The number of null entries in the array. Negative value means that 938 the null count is not known. 939 offset : int, default 0 940 The array's logical offset (in values, not in bytes) from the 941 start of each buffer. 942 children : List[Array], default None 943 Nested type children with length matching type.num_fields. 944 945 Returns 946 ------- 947 array : Array 948 """ 949 cdef: 950 Buffer buf 951 Array child 952 vector[shared_ptr[CBuffer]] c_buffers 953 vector[shared_ptr[CArrayData]] c_child_data 954 shared_ptr[CArrayData] array_data 955 956 children = children or [] 957 958 if type.num_fields != len(children): 959 raise ValueError("Type's expected number of children " 960 "({0}) did not match the passed number " 961 "({1}).".format(type.num_fields, len(children))) 962 963 if type.num_buffers != len(buffers): 964 raise ValueError("Type's expected number of buffers " 965 "({0}) did not match the passed number " 966 "({1}).".format(type.num_buffers, len(buffers))) 967 968 for buf in buffers: 969 # None will produce a null buffer pointer 970 c_buffers.push_back(pyarrow_unwrap_buffer(buf)) 971 972 for child in children: 973 c_child_data.push_back(child.ap.data()) 974 975 array_data = CArrayData.MakeWithChildren(type.sp_type, length, 976 c_buffers, c_child_data, 977 null_count, offset) 978 cdef Array result = pyarrow_wrap_array(MakeArray(array_data)) 979 result.validate() 980 return result 981 982 @property 983 def null_count(self): 984 return self.sp_array.get().null_count() 985 986 @property 987 def nbytes(self): 988 """ 989 Total number of bytes consumed by the elements of the array. 990 """ 991 size = 0 992 for buf in self.buffers(): 993 if buf is not None: 994 size += buf.size 995 return size 996 997 def __sizeof__(self): 998 return super(Array, self).__sizeof__() + self.nbytes 999 1000 def __iter__(self): 1001 for i in range(len(self)): 1002 yield self.getitem(i) 1003 1004 def __repr__(self): 1005 type_format = object.__repr__(self) 1006 return '{0}\n{1}'.format(type_format, str(self)) 1007 1008 def to_string(self, *, int indent=0, int window=10, 1009 c_bool skip_new_lines=False): 1010 """ 1011 Render a "pretty-printed" string representation of the Array. 1012 1013 Parameters 1014 ---------- 1015 indent : int 1016 How much to indent right the content of the array, 1017 by default ``0``. 1018 window : int 1019 How many items to preview at the begin and end 1020 of the array when the arrays is bigger than the window. 1021 The other elements will be ellipsed. 1022 skip_new_lines : bool 1023 If the array should be rendered as a single line of text 1024 or if each element should be on its own line. 1025 """ 1026 cdef: 1027 c_string result 1028 PrettyPrintOptions options 1029 1030 with nogil: 1031 options = PrettyPrintOptions(indent, window) 1032 options.skip_new_lines = skip_new_lines 1033 check_status( 1034 PrettyPrint( 1035 deref(self.ap), 1036 options, 1037 &result 1038 ) 1039 ) 1040 1041 return frombytes(result, safe=True) 1042 1043 def format(self, **kwargs): 1044 import warnings 1045 warnings.warn('Array.format is deprecated, use Array.to_string') 1046 return self.to_string(**kwargs) 1047 1048 def __str__(self): 1049 return self.to_string() 1050 1051 def __eq__(self, other): 1052 try: 1053 return self.equals(other) 1054 except TypeError: 1055 # This also handles comparing with None 1056 # as Array.equals(None) raises a TypeError. 1057 return NotImplemented 1058 1059 def equals(Array self, Array other not None): 1060 return self.ap.Equals(deref(other.ap)) 1061 1062 def __len__(self): 1063 return self.length() 1064 1065 cdef int64_t length(self): 1066 if self.sp_array.get(): 1067 return self.sp_array.get().length() 1068 else: 1069 return 0 1070 1071 def is_null(self, *, nan_is_null=False): 1072 """ 1073 Return BooleanArray indicating the null values. 1074 1075 Parameters 1076 ---------- 1077 nan_is_null : bool (optional, default False) 1078 Whether floating-point NaN values should also be considered null. 1079 1080 Returns 1081 ------- 1082 array : boolean Array 1083 """ 1084 options = _pc().NullOptions(nan_is_null=nan_is_null) 1085 return _pc().call_function('is_null', [self], options) 1086 1087 def is_valid(self): 1088 """ 1089 Return BooleanArray indicating the non-null values. 1090 """ 1091 return _pc().is_valid(self) 1092 1093 def fill_null(self, fill_value): 1094 """ 1095 See pyarrow.compute.fill_null for usage. 1096 """ 1097 return _pc().fill_null(self, fill_value) 1098 1099 def __getitem__(self, key): 1100 """ 1101 Slice or return value at given index 1102 1103 Parameters 1104 ---------- 1105 key : integer or slice 1106 Slices with step not equal to 1 (or None) will produce a copy 1107 rather than a zero-copy view 1108 1109 Returns 1110 ------- 1111 value : Scalar (index) or Array (slice) 1112 """ 1113 if PySlice_Check(key): 1114 return _normalize_slice(self, key) 1115 1116 return self.getitem(_normalize_index(key, self.length())) 1117 1118 cdef getitem(self, int64_t i): 1119 return Scalar.wrap(GetResultValue(self.ap.GetScalar(i))) 1120 1121 def slice(self, offset=0, length=None): 1122 """ 1123 Compute zero-copy slice of this array. 1124 1125 Parameters 1126 ---------- 1127 offset : int, default 0 1128 Offset from start of array to slice. 1129 length : int, default None 1130 Length of slice (default is until end of Array starting from 1131 offset). 1132 1133 Returns 1134 ------- 1135 sliced : RecordBatch 1136 """ 1137 cdef: 1138 shared_ptr[CArray] result 1139 1140 if offset < 0: 1141 raise IndexError('Offset must be non-negative') 1142 1143 offset = min(len(self), offset) 1144 if length is None: 1145 result = self.ap.Slice(offset) 1146 else: 1147 if length < 0: 1148 raise ValueError('Length must be non-negative') 1149 result = self.ap.Slice(offset, length) 1150 1151 return pyarrow_wrap_array(result) 1152 1153 def take(self, object indices): 1154 """ 1155 Select values from an array. See pyarrow.compute.take for full usage. 1156 """ 1157 return _pc().take(self, indices) 1158 1159 def drop_null(self): 1160 """ 1161 Remove missing values from an array. 1162 """ 1163 return _pc().drop_null(self) 1164 1165 def filter(self, Array mask, *, null_selection_behavior='drop'): 1166 """ 1167 Select values from an array. See pyarrow.compute.filter for full usage. 1168 """ 1169 return _pc().filter(self, mask, 1170 null_selection_behavior=null_selection_behavior) 1171 1172 def index(self, value, start=None, end=None, *, memory_pool=None): 1173 """ 1174 Find the first index of a value. 1175 1176 See pyarrow.compute.index for full usage. 1177 """ 1178 return _pc().index(self, value, start, end, memory_pool=memory_pool) 1179 1180 def _to_pandas(self, options, **kwargs): 1181 return _array_like_to_pandas(self, options) 1182 1183 def __array__(self, dtype=None): 1184 values = self.to_numpy(zero_copy_only=False) 1185 if dtype is None: 1186 return values 1187 return values.astype(dtype) 1188 1189 def to_numpy(self, zero_copy_only=True, writable=False): 1190 """ 1191 Return a NumPy view or copy of this array (experimental). 1192 1193 By default, tries to return a view of this array. This is only 1194 supported for primitive arrays with the same memory layout as NumPy 1195 (i.e. integers, floating point, ..) and without any nulls. 1196 1197 Parameters 1198 ---------- 1199 zero_copy_only : bool, default True 1200 If True, an exception will be raised if the conversion to a numpy 1201 array would require copying the underlying data (e.g. in presence 1202 of nulls, or for non-primitive types). 1203 writable : bool, default False 1204 For numpy arrays created with zero copy (view on the Arrow data), 1205 the resulting array is not writable (Arrow data is immutable). 1206 By setting this to True, a copy of the array is made to ensure 1207 it is writable. 1208 1209 Returns 1210 ------- 1211 array : numpy.ndarray 1212 """ 1213 cdef: 1214 PyObject* out 1215 PandasOptions c_options 1216 object values 1217 1218 if zero_copy_only and writable: 1219 raise ValueError( 1220 "Cannot return a writable array if asking for zero-copy") 1221 1222 # If there are nulls and the array is a DictionaryArray 1223 # decoding the dictionary will make sure nulls are correctly handled. 1224 # Decoding a dictionary does imply a copy by the way, 1225 # so it can't be done if the user requested a zero_copy. 1226 c_options.decode_dictionaries = not zero_copy_only 1227 c_options.zero_copy_only = zero_copy_only 1228 1229 with nogil: 1230 check_status(ConvertArrayToPandas(c_options, self.sp_array, 1231 self, &out)) 1232 1233 # wrap_array_output uses pandas to convert to Categorical, here 1234 # always convert to numpy array without pandas dependency 1235 array = PyObject_to_object(out) 1236 1237 if isinstance(array, dict): 1238 array = np.take(array['dictionary'], array['indices']) 1239 1240 if writable and not array.flags.writeable: 1241 # if the conversion already needed to a copy, writeable is True 1242 array = array.copy() 1243 return array 1244 1245 def to_pylist(self): 1246 """ 1247 Convert to a list of native Python objects. 1248 1249 Returns 1250 ------- 1251 lst : list 1252 """ 1253 return [x.as_py() for x in self] 1254 1255 def tolist(self): 1256 """ 1257 Alias of to_pylist for compatibility with NumPy. 1258 """ 1259 return self.to_pylist() 1260 1261 def validate(self, *, full=False): 1262 """ 1263 Perform validation checks. An exception is raised if validation fails. 1264 1265 By default only cheap validation checks are run. Pass `full=True` 1266 for thorough validation checks (potentially O(n)). 1267 1268 Parameters 1269 ---------- 1270 full: bool, default False 1271 If True, run expensive checks, otherwise cheap checks only. 1272 1273 Raises 1274 ------ 1275 ArrowInvalid 1276 """ 1277 if full: 1278 with nogil: 1279 check_status(self.ap.ValidateFull()) 1280 else: 1281 with nogil: 1282 check_status(self.ap.Validate()) 1283 1284 @property 1285 def offset(self): 1286 """ 1287 A relative position into another array's data. 1288 1289 The purpose is to enable zero-copy slicing. This value defaults to zero 1290 but must be applied on all operations with the physical storage 1291 buffers. 1292 """ 1293 return self.sp_array.get().offset() 1294 1295 def buffers(self): 1296 """ 1297 Return a list of Buffer objects pointing to this array's physical 1298 storage. 1299 1300 To correctly interpret these buffers, you need to also apply the offset 1301 multiplied with the size of the stored data type. 1302 """ 1303 res = [] 1304 _append_array_buffers(self.sp_array.get().data().get(), res) 1305 return res 1306 1307 def _export_to_c(self, uintptr_t out_ptr, uintptr_t out_schema_ptr=0): 1308 """ 1309 Export to a C ArrowArray struct, given its pointer. 1310 1311 If a C ArrowSchema struct pointer is also given, the array type 1312 is exported to it at the same time. 1313 1314 Parameters 1315 ---------- 1316 out_ptr: int 1317 The raw pointer to a C ArrowArray struct. 1318 out_schema_ptr: int (optional) 1319 The raw pointer to a C ArrowSchema struct. 1320 1321 Be careful: if you don't pass the ArrowArray struct to a consumer, 1322 array memory will leak. This is a low-level function intended for 1323 expert users. 1324 """ 1325 with nogil: 1326 check_status(ExportArray(deref(self.sp_array), 1327 <ArrowArray*> out_ptr, 1328 <ArrowSchema*> out_schema_ptr)) 1329 1330 @staticmethod 1331 def _import_from_c(uintptr_t in_ptr, type): 1332 """ 1333 Import Array from a C ArrowArray struct, given its pointer 1334 and the imported array type. 1335 1336 Parameters 1337 ---------- 1338 in_ptr: int 1339 The raw pointer to a C ArrowArray struct. 1340 type: DataType or int 1341 Either a DataType object, or the raw pointer to a C ArrowSchema 1342 struct. 1343 1344 This is a low-level function intended for expert users. 1345 """ 1346 cdef: 1347 shared_ptr[CArray] c_array 1348 1349 c_type = pyarrow_unwrap_data_type(type) 1350 if c_type == nullptr: 1351 # Not a DataType object, perhaps a raw ArrowSchema pointer 1352 type_ptr = <uintptr_t> type 1353 with nogil: 1354 c_array = GetResultValue(ImportArray(<ArrowArray*> in_ptr, 1355 <ArrowSchema*> type_ptr)) 1356 else: 1357 with nogil: 1358 c_array = GetResultValue(ImportArray(<ArrowArray*> in_ptr, 1359 c_type)) 1360 return pyarrow_wrap_array(c_array) 1361 1362 1363cdef _array_like_to_pandas(obj, options): 1364 cdef: 1365 PyObject* out 1366 PandasOptions c_options = _convert_pandas_options(options) 1367 1368 original_type = obj.type 1369 name = obj._name 1370 1371 # ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns] 1372 c_options.coerce_temporal_nanoseconds = True 1373 1374 if isinstance(obj, Array): 1375 with nogil: 1376 check_status(ConvertArrayToPandas(c_options, 1377 (<Array> obj).sp_array, 1378 obj, &out)) 1379 elif isinstance(obj, ChunkedArray): 1380 with nogil: 1381 check_status(libarrow.ConvertChunkedArrayToPandas( 1382 c_options, 1383 (<ChunkedArray> obj).sp_chunked_array, 1384 obj, &out)) 1385 1386 arr = wrap_array_output(out) 1387 1388 if (isinstance(original_type, TimestampType) and 1389 options["timestamp_as_object"]): 1390 # ARROW-5359 - need to specify object dtype to avoid pandas to 1391 # coerce back to ns resolution 1392 dtype = "object" 1393 else: 1394 dtype = None 1395 1396 result = pandas_api.series(arr, dtype=dtype, name=name) 1397 1398 if (isinstance(original_type, TimestampType) and 1399 original_type.tz is not None and 1400 # can be object dtype for non-ns and timestamp_as_object=True 1401 result.dtype.kind == "M"): 1402 from pyarrow.pandas_compat import make_tz_aware 1403 result = make_tz_aware(result, original_type.tz) 1404 1405 return result 1406 1407 1408cdef wrap_array_output(PyObject* output): 1409 cdef object obj = PyObject_to_object(output) 1410 1411 if isinstance(obj, dict): 1412 return pandas_api.categorical_type(obj['indices'], 1413 categories=obj['dictionary'], 1414 ordered=obj['ordered'], 1415 fastpath=True) 1416 else: 1417 return obj 1418 1419 1420cdef class NullArray(Array): 1421 """ 1422 Concrete class for Arrow arrays of null data type. 1423 """ 1424 1425 1426cdef class BooleanArray(Array): 1427 """ 1428 Concrete class for Arrow arrays of boolean data type. 1429 """ 1430 @property 1431 def false_count(self): 1432 return (<CBooleanArray*> self.ap).false_count() 1433 1434 @property 1435 def true_count(self): 1436 return (<CBooleanArray*> self.ap).true_count() 1437 1438 1439cdef class NumericArray(Array): 1440 """ 1441 A base class for Arrow numeric arrays. 1442 """ 1443 1444 1445cdef class IntegerArray(NumericArray): 1446 """ 1447 A base class for Arrow integer arrays. 1448 """ 1449 1450 1451cdef class FloatingPointArray(NumericArray): 1452 """ 1453 A base class for Arrow floating-point arrays. 1454 """ 1455 1456 1457cdef class Int8Array(IntegerArray): 1458 """ 1459 Concrete class for Arrow arrays of int8 data type. 1460 """ 1461 1462 1463cdef class UInt8Array(IntegerArray): 1464 """ 1465 Concrete class for Arrow arrays of uint8 data type. 1466 """ 1467 1468 1469cdef class Int16Array(IntegerArray): 1470 """ 1471 Concrete class for Arrow arrays of int16 data type. 1472 """ 1473 1474 1475cdef class UInt16Array(IntegerArray): 1476 """ 1477 Concrete class for Arrow arrays of uint16 data type. 1478 """ 1479 1480 1481cdef class Int32Array(IntegerArray): 1482 """ 1483 Concrete class for Arrow arrays of int32 data type. 1484 """ 1485 1486 1487cdef class UInt32Array(IntegerArray): 1488 """ 1489 Concrete class for Arrow arrays of uint32 data type. 1490 """ 1491 1492 1493cdef class Int64Array(IntegerArray): 1494 """ 1495 Concrete class for Arrow arrays of int64 data type. 1496 """ 1497 1498 1499cdef class UInt64Array(IntegerArray): 1500 """ 1501 Concrete class for Arrow arrays of uint64 data type. 1502 """ 1503 1504 1505cdef class Date32Array(NumericArray): 1506 """ 1507 Concrete class for Arrow arrays of date32 data type. 1508 """ 1509 1510 1511cdef class Date64Array(NumericArray): 1512 """ 1513 Concrete class for Arrow arrays of date64 data type. 1514 """ 1515 1516 1517cdef class TimestampArray(NumericArray): 1518 """ 1519 Concrete class for Arrow arrays of timestamp data type. 1520 """ 1521 1522 1523cdef class Time32Array(NumericArray): 1524 """ 1525 Concrete class for Arrow arrays of time32 data type. 1526 """ 1527 1528 1529cdef class Time64Array(NumericArray): 1530 """ 1531 Concrete class for Arrow arrays of time64 data type. 1532 """ 1533 1534 1535cdef class DurationArray(NumericArray): 1536 """ 1537 Concrete class for Arrow arrays of duration data type. 1538 """ 1539 1540 1541cdef class MonthDayNanoIntervalArray(Array): 1542 """ 1543 Concrete class for Arrow arrays of interval[MonthDayNano] type. 1544 """ 1545 1546 def to_pylist(self): 1547 """ 1548 Convert to a list of native Python objects. 1549 1550 pyarrow.MonthDayNano is used as the native representation. 1551 1552 Returns 1553 ------- 1554 lst : list 1555 """ 1556 cdef: 1557 CResult[PyObject*] maybe_py_list 1558 PyObject* py_list 1559 CMonthDayNanoIntervalArray* array 1560 array = <CMonthDayNanoIntervalArray*>self.sp_array.get() 1561 maybe_py_list = MonthDayNanoIntervalArrayToPyList(deref(array)) 1562 py_list = GetResultValue(maybe_py_list) 1563 return PyObject_to_object(py_list) 1564 1565 1566cdef class HalfFloatArray(FloatingPointArray): 1567 """ 1568 Concrete class for Arrow arrays of float16 data type. 1569 """ 1570 1571 1572cdef class FloatArray(FloatingPointArray): 1573 """ 1574 Concrete class for Arrow arrays of float32 data type. 1575 """ 1576 1577 1578cdef class DoubleArray(FloatingPointArray): 1579 """ 1580 Concrete class for Arrow arrays of float64 data type. 1581 """ 1582 1583 1584cdef class FixedSizeBinaryArray(Array): 1585 """ 1586 Concrete class for Arrow arrays of a fixed-size binary data type. 1587 """ 1588 1589 1590cdef class Decimal128Array(FixedSizeBinaryArray): 1591 """ 1592 Concrete class for Arrow arrays of decimal128 data type. 1593 """ 1594 1595 1596cdef class Decimal256Array(FixedSizeBinaryArray): 1597 """ 1598 Concrete class for Arrow arrays of decimal256 data type. 1599 """ 1600 1601cdef class BaseListArray(Array): 1602 1603 def flatten(self): 1604 """ 1605 Unnest this ListArray/LargeListArray by one level. 1606 1607 The returned Array is logically a concatenation of all the sub-lists 1608 in this Array. 1609 1610 Note that this method is different from ``self.values()`` in that 1611 it takes care of the slicing offset as well as null elements backed 1612 by non-empty sub-lists. 1613 1614 Returns 1615 ------- 1616 result : Array 1617 """ 1618 return _pc().list_flatten(self) 1619 1620 def value_parent_indices(self): 1621 """ 1622 Return array of same length as list child values array where each 1623 output value is the index of the parent list array slot containing each 1624 child value. 1625 1626 Examples 1627 -------- 1628 >>> arr = pa.array([[1, 2, 3], [], None, [4]], 1629 ... type=pa.list_(pa.int32())) 1630 >>> arr.value_parent_indices() 1631 <pyarrow.lib.Int32Array object at 0x7efc5db958a0> 1632 [ 1633 0, 1634 0, 1635 0, 1636 3 1637 ] 1638 """ 1639 return _pc().list_parent_indices(self) 1640 1641 def value_lengths(self): 1642 """ 1643 Return integers array with values equal to the respective length of 1644 each list element. Null list values are null in the output. 1645 1646 Examples 1647 -------- 1648 >>> arr = pa.array([[1, 2, 3], [], None, [4]], 1649 ... type=pa.list_(pa.int32())) 1650 >>> arr.value_lengths() 1651 <pyarrow.lib.Int32Array object at 0x7efc5db95910> 1652 [ 1653 3, 1654 0, 1655 null, 1656 1 1657 ] 1658 """ 1659 return _pc().list_value_length(self) 1660 1661 1662cdef class ListArray(BaseListArray): 1663 """ 1664 Concrete class for Arrow arrays of a list data type. 1665 """ 1666 1667 @staticmethod 1668 def from_arrays(offsets, values, MemoryPool pool=None): 1669 """ 1670 Construct ListArray from arrays of int32 offsets and values. 1671 1672 Parameters 1673 ---------- 1674 offsets : Array (int32 type) 1675 values : Array (any type) 1676 pool : MemoryPool 1677 1678 Returns 1679 ------- 1680 list_array : ListArray 1681 1682 Examples 1683 -------- 1684 >>> values = pa.array([1, 2, 3, 4]) 1685 >>> offsets = pa.array([0, 2, 4]) 1686 >>> pa.ListArray.from_arrays(offsets, values) 1687 <pyarrow.lib.ListArray object at 0x7fbde226bf40> 1688 [ 1689 [ 1690 0, 1691 1 1692 ], 1693 [ 1694 2, 1695 3 1696 ] 1697 ] 1698 # nulls in the offsets array become null lists 1699 >>> offsets = pa.array([0, None, 2, 4]) 1700 >>> pa.ListArray.from_arrays(offsets, values) 1701 <pyarrow.lib.ListArray object at 0x7fbde226bf40> 1702 [ 1703 [ 1704 0, 1705 1 1706 ], 1707 null, 1708 [ 1709 2, 1710 3 1711 ] 1712 ] 1713 """ 1714 cdef: 1715 Array _offsets, _values 1716 shared_ptr[CArray] out 1717 cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) 1718 1719 _offsets = asarray(offsets, type='int32') 1720 _values = asarray(values) 1721 1722 with nogil: 1723 out = GetResultValue( 1724 CListArray.FromArrays(_offsets.ap[0], _values.ap[0], cpool)) 1725 cdef Array result = pyarrow_wrap_array(out) 1726 result.validate() 1727 return result 1728 1729 @property 1730 def values(self): 1731 cdef CListArray* arr = <CListArray*> self.ap 1732 return pyarrow_wrap_array(arr.values()) 1733 1734 @property 1735 def offsets(self): 1736 """ 1737 Return the offsets as an int32 array. 1738 """ 1739 return pyarrow_wrap_array((<CListArray*> self.ap).offsets()) 1740 1741 1742cdef class LargeListArray(BaseListArray): 1743 """ 1744 Concrete class for Arrow arrays of a large list data type. 1745 1746 Identical to ListArray, but 64-bit offsets. 1747 """ 1748 1749 @staticmethod 1750 def from_arrays(offsets, values, MemoryPool pool=None): 1751 """ 1752 Construct LargeListArray from arrays of int64 offsets and values. 1753 1754 Parameters 1755 ---------- 1756 offsets : Array (int64 type) 1757 values : Array (any type) 1758 pool : MemoryPool 1759 1760 Returns 1761 ------- 1762 list_array : LargeListArray 1763 """ 1764 cdef: 1765 Array _offsets, _values 1766 shared_ptr[CArray] out 1767 cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) 1768 1769 _offsets = asarray(offsets, type='int64') 1770 _values = asarray(values) 1771 1772 with nogil: 1773 out = GetResultValue( 1774 CLargeListArray.FromArrays(_offsets.ap[0], _values.ap[0], 1775 cpool)) 1776 cdef Array result = pyarrow_wrap_array(out) 1777 result.validate() 1778 return result 1779 1780 @property 1781 def values(self): 1782 cdef CLargeListArray* arr = <CLargeListArray*> self.ap 1783 return pyarrow_wrap_array(arr.values()) 1784 1785 @property 1786 def offsets(self): 1787 """ 1788 Return the offsets as an int64 array. 1789 """ 1790 return pyarrow_wrap_array((<CLargeListArray*> self.ap).offsets()) 1791 1792 1793cdef class MapArray(Array): 1794 """ 1795 Concrete class for Arrow arrays of a map data type. 1796 """ 1797 1798 @staticmethod 1799 def from_arrays(offsets, keys, items, MemoryPool pool=None): 1800 """ 1801 Construct MapArray from arrays of int32 offsets and key, item arrays. 1802 1803 Parameters 1804 ---------- 1805 offsets : array-like or sequence (int32 type) 1806 keys : array-like or sequence (any type) 1807 items : array-like or sequence (any type) 1808 pool : MemoryPool 1809 1810 Returns 1811 ------- 1812 map_array : MapArray 1813 """ 1814 cdef: 1815 Array _offsets, _keys, _items 1816 shared_ptr[CArray] out 1817 cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) 1818 1819 _offsets = asarray(offsets, type='int32') 1820 _keys = asarray(keys) 1821 _items = asarray(items) 1822 1823 with nogil: 1824 out = GetResultValue( 1825 CMapArray.FromArrays(_offsets.sp_array, 1826 _keys.sp_array, 1827 _items.sp_array, cpool)) 1828 cdef Array result = pyarrow_wrap_array(out) 1829 result.validate() 1830 return result 1831 1832 @property 1833 def keys(self): 1834 return pyarrow_wrap_array((<CMapArray*> self.ap).keys()) 1835 1836 @property 1837 def items(self): 1838 return pyarrow_wrap_array((<CMapArray*> self.ap).items()) 1839 1840 1841cdef class FixedSizeListArray(Array): 1842 """ 1843 Concrete class for Arrow arrays of a fixed size list data type. 1844 """ 1845 1846 @staticmethod 1847 def from_arrays(values, int32_t list_size): 1848 """ 1849 Construct FixedSizeListArray from array of values and a list length. 1850 1851 Parameters 1852 ---------- 1853 values : Array (any type) 1854 list_size : int 1855 The fixed length of the lists. 1856 1857 Returns 1858 ------- 1859 FixedSizeListArray 1860 """ 1861 cdef: 1862 Array _values 1863 CResult[shared_ptr[CArray]] c_result 1864 1865 _values = asarray(values) 1866 1867 with nogil: 1868 c_result = CFixedSizeListArray.FromArrays( 1869 _values.sp_array, list_size) 1870 cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) 1871 result.validate() 1872 return result 1873 1874 @property 1875 def values(self): 1876 return self.flatten() 1877 1878 def flatten(self): 1879 """ 1880 Unnest this FixedSizeListArray by one level. 1881 1882 Returns 1883 ------- 1884 result : Array 1885 """ 1886 cdef CFixedSizeListArray* arr = <CFixedSizeListArray*> self.ap 1887 return pyarrow_wrap_array(arr.values()) 1888 1889 1890cdef class UnionArray(Array): 1891 """ 1892 Concrete class for Arrow arrays of a Union data type. 1893 """ 1894 1895 def child(self, int pos): 1896 import warnings 1897 warnings.warn("child is deprecated, use field", FutureWarning) 1898 return self.field(pos) 1899 1900 def field(self, int pos): 1901 """ 1902 Return the given child field as an individual array. 1903 1904 For sparse unions, the returned array has its offset, length, 1905 and null count adjusted. 1906 1907 For dense unions, the returned array is unchanged. 1908 """ 1909 cdef shared_ptr[CArray] result 1910 result = (<CUnionArray*> self.ap).field(pos) 1911 if result != NULL: 1912 return pyarrow_wrap_array(result) 1913 raise KeyError("UnionArray does not have child {}".format(pos)) 1914 1915 @property 1916 def type_codes(self): 1917 """Get the type codes array.""" 1918 buf = pyarrow_wrap_buffer((<CUnionArray*> self.ap).type_codes()) 1919 return Array.from_buffers(int8(), len(self), [None, buf]) 1920 1921 @property 1922 def offsets(self): 1923 """ 1924 Get the value offsets array (dense arrays only). 1925 1926 Does not account for any slice offset. 1927 """ 1928 if self.type.mode != "dense": 1929 raise ArrowTypeError("Can only get value offsets for dense arrays") 1930 cdef CDenseUnionArray* dense = <CDenseUnionArray*> self.ap 1931 buf = pyarrow_wrap_buffer(dense.value_offsets()) 1932 return Array.from_buffers(int32(), len(self), [None, buf]) 1933 1934 @staticmethod 1935 def from_dense(Array types, Array value_offsets, list children, 1936 list field_names=None, list type_codes=None): 1937 """ 1938 Construct dense UnionArray from arrays of int8 types, int32 offsets and 1939 children arrays 1940 1941 Parameters 1942 ---------- 1943 types : Array (int8 type) 1944 value_offsets : Array (int32 type) 1945 children : list 1946 field_names : list 1947 type_codes : list 1948 1949 Returns 1950 ------- 1951 union_array : UnionArray 1952 """ 1953 cdef: 1954 shared_ptr[CArray] out 1955 vector[shared_ptr[CArray]] c 1956 Array child 1957 vector[c_string] c_field_names 1958 vector[int8_t] c_type_codes 1959 1960 for child in children: 1961 c.push_back(child.sp_array) 1962 if field_names is not None: 1963 for x in field_names: 1964 c_field_names.push_back(tobytes(x)) 1965 if type_codes is not None: 1966 for x in type_codes: 1967 c_type_codes.push_back(x) 1968 1969 with nogil: 1970 out = GetResultValue(CDenseUnionArray.Make( 1971 deref(types.ap), deref(value_offsets.ap), c, c_field_names, 1972 c_type_codes)) 1973 1974 cdef Array result = pyarrow_wrap_array(out) 1975 result.validate() 1976 return result 1977 1978 @staticmethod 1979 def from_sparse(Array types, list children, list field_names=None, 1980 list type_codes=None): 1981 """ 1982 Construct sparse UnionArray from arrays of int8 types and children 1983 arrays 1984 1985 Parameters 1986 ---------- 1987 types : Array (int8 type) 1988 children : list 1989 field_names : list 1990 type_codes : list 1991 1992 Returns 1993 ------- 1994 union_array : UnionArray 1995 """ 1996 cdef: 1997 shared_ptr[CArray] out 1998 vector[shared_ptr[CArray]] c 1999 Array child 2000 vector[c_string] c_field_names 2001 vector[int8_t] c_type_codes 2002 2003 for child in children: 2004 c.push_back(child.sp_array) 2005 if field_names is not None: 2006 for x in field_names: 2007 c_field_names.push_back(tobytes(x)) 2008 if type_codes is not None: 2009 for x in type_codes: 2010 c_type_codes.push_back(x) 2011 2012 with nogil: 2013 out = GetResultValue(CSparseUnionArray.Make( 2014 deref(types.ap), c, c_field_names, c_type_codes)) 2015 2016 cdef Array result = pyarrow_wrap_array(out) 2017 result.validate() 2018 return result 2019 2020 2021cdef class StringArray(Array): 2022 """ 2023 Concrete class for Arrow arrays of string (or utf8) data type. 2024 """ 2025 2026 @staticmethod 2027 def from_buffers(int length, Buffer value_offsets, Buffer data, 2028 Buffer null_bitmap=None, int null_count=-1, 2029 int offset=0): 2030 """ 2031 Construct a StringArray from value_offsets and data buffers. 2032 If there are nulls in the data, also a null_bitmap and the matching 2033 null_count must be passed. 2034 2035 Parameters 2036 ---------- 2037 length : int 2038 value_offsets : Buffer 2039 data : Buffer 2040 null_bitmap : Buffer, optional 2041 null_count : int, default 0 2042 offset : int, default 0 2043 2044 Returns 2045 ------- 2046 string_array : StringArray 2047 """ 2048 return Array.from_buffers(utf8(), length, 2049 [null_bitmap, value_offsets, data], 2050 null_count, offset) 2051 2052 2053cdef class LargeStringArray(Array): 2054 """ 2055 Concrete class for Arrow arrays of large string (or utf8) data type. 2056 """ 2057 2058 @staticmethod 2059 def from_buffers(int length, Buffer value_offsets, Buffer data, 2060 Buffer null_bitmap=None, int null_count=-1, 2061 int offset=0): 2062 """ 2063 Construct a LargeStringArray from value_offsets and data buffers. 2064 If there are nulls in the data, also a null_bitmap and the matching 2065 null_count must be passed. 2066 2067 Parameters 2068 ---------- 2069 length : int 2070 value_offsets : Buffer 2071 data : Buffer 2072 null_bitmap : Buffer, optional 2073 null_count : int, default 0 2074 offset : int, default 0 2075 2076 Returns 2077 ------- 2078 string_array : StringArray 2079 """ 2080 return Array.from_buffers(large_utf8(), length, 2081 [null_bitmap, value_offsets, data], 2082 null_count, offset) 2083 2084 2085cdef class BinaryArray(Array): 2086 """ 2087 Concrete class for Arrow arrays of variable-sized binary data type. 2088 """ 2089 @property 2090 def total_values_length(self): 2091 """ 2092 The number of bytes from beginning to end of the data buffer addressed 2093 by the offsets of this BinaryArray. 2094 """ 2095 return (<CBinaryArray*> self.ap).total_values_length() 2096 2097 2098cdef class LargeBinaryArray(Array): 2099 """ 2100 Concrete class for Arrow arrays of large variable-sized binary data type. 2101 """ 2102 @property 2103 def total_values_length(self): 2104 """ 2105 The number of bytes from beginning to end of the data buffer addressed 2106 by the offsets of this LargeBinaryArray. 2107 """ 2108 return (<CLargeBinaryArray*> self.ap).total_values_length() 2109 2110 2111cdef class DictionaryArray(Array): 2112 """ 2113 Concrete class for dictionary-encoded Arrow arrays. 2114 """ 2115 2116 def dictionary_encode(self): 2117 return self 2118 2119 def dictionary_decode(self): 2120 """ 2121 Decodes the DictionaryArray to an Array. 2122 """ 2123 return self.dictionary.take(self.indices) 2124 2125 @property 2126 def dictionary(self): 2127 cdef CDictionaryArray* darr = <CDictionaryArray*>(self.ap) 2128 2129 if self._dictionary is None: 2130 self._dictionary = pyarrow_wrap_array(darr.dictionary()) 2131 2132 return self._dictionary 2133 2134 @property 2135 def indices(self): 2136 cdef CDictionaryArray* darr = <CDictionaryArray*>(self.ap) 2137 2138 if self._indices is None: 2139 self._indices = pyarrow_wrap_array(darr.indices()) 2140 2141 return self._indices 2142 2143 @staticmethod 2144 def from_arrays(indices, dictionary, mask=None, bint ordered=False, 2145 bint from_pandas=False, bint safe=True, 2146 MemoryPool memory_pool=None): 2147 """ 2148 Construct a DictionaryArray from indices and values. 2149 2150 Parameters 2151 ---------- 2152 indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type 2153 Non-negative integers referencing the dictionary values by zero 2154 based index. 2155 dictionary : pyarrow.Array, ndarray or pandas.Series 2156 The array of values referenced by the indices. 2157 mask : ndarray or pandas.Series, bool type 2158 True values indicate that indices are actually null. 2159 from_pandas : bool, default False 2160 If True, the indices should be treated as though they originated in 2161 a pandas.Categorical (null encoded as -1). 2162 ordered : bool, default False 2163 Set to True if the category values are ordered. 2164 safe : bool, default True 2165 If True, check that the dictionary indices are in range. 2166 memory_pool : MemoryPool, default None 2167 For memory allocations, if required, otherwise uses default pool. 2168 2169 Returns 2170 ------- 2171 dict_array : DictionaryArray 2172 """ 2173 cdef: 2174 Array _indices, _dictionary 2175 shared_ptr[CDataType] c_type 2176 shared_ptr[CArray] c_result 2177 2178 if isinstance(indices, Array): 2179 if mask is not None: 2180 raise NotImplementedError( 2181 "mask not implemented with Arrow array inputs yet") 2182 _indices = indices 2183 else: 2184 if from_pandas: 2185 _indices = _codes_to_indices(indices, mask, None, memory_pool) 2186 else: 2187 _indices = array(indices, mask=mask, memory_pool=memory_pool) 2188 2189 if isinstance(dictionary, Array): 2190 _dictionary = dictionary 2191 else: 2192 _dictionary = array(dictionary, memory_pool=memory_pool) 2193 2194 if not isinstance(_indices, IntegerArray): 2195 raise ValueError('Indices must be integer type') 2196 2197 cdef c_bool c_ordered = ordered 2198 2199 c_type.reset(new CDictionaryType(_indices.type.sp_type, 2200 _dictionary.sp_array.get().type(), 2201 c_ordered)) 2202 2203 if safe: 2204 with nogil: 2205 c_result = GetResultValue( 2206 CDictionaryArray.FromArrays(c_type, _indices.sp_array, 2207 _dictionary.sp_array)) 2208 else: 2209 c_result.reset(new CDictionaryArray(c_type, _indices.sp_array, 2210 _dictionary.sp_array)) 2211 2212 cdef Array result = pyarrow_wrap_array(c_result) 2213 result.validate() 2214 return result 2215 2216 2217cdef class StructArray(Array): 2218 """ 2219 Concrete class for Arrow arrays of a struct data type. 2220 """ 2221 2222 def field(self, index): 2223 """ 2224 Retrieves the child array belonging to field. 2225 2226 Parameters 2227 ---------- 2228 index : Union[int, str] 2229 Index / position or name of the field. 2230 2231 Returns 2232 ------- 2233 result : Array 2234 """ 2235 cdef: 2236 CStructArray* arr = <CStructArray*> self.ap 2237 shared_ptr[CArray] child 2238 2239 if isinstance(index, (bytes, str)): 2240 child = arr.GetFieldByName(tobytes(index)) 2241 if child == nullptr: 2242 raise KeyError(index) 2243 elif isinstance(index, int): 2244 child = arr.field( 2245 <int>_normalize_index(index, self.ap.num_fields())) 2246 else: 2247 raise TypeError('Expected integer or string index') 2248 2249 return pyarrow_wrap_array(child) 2250 2251 def flatten(self, MemoryPool memory_pool=None): 2252 """ 2253 Return one individual array for each field in the struct. 2254 2255 Parameters 2256 ---------- 2257 memory_pool : MemoryPool, default None 2258 For memory allocations, if required, otherwise use default pool. 2259 2260 Returns 2261 ------- 2262 result : List[Array] 2263 """ 2264 cdef: 2265 vector[shared_ptr[CArray]] arrays 2266 CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) 2267 CStructArray* sarr = <CStructArray*> self.ap 2268 2269 with nogil: 2270 arrays = GetResultValue(sarr.Flatten(pool)) 2271 2272 return [pyarrow_wrap_array(arr) for arr in arrays] 2273 2274 @staticmethod 2275 def from_arrays(arrays, names=None, fields=None, mask=None, 2276 memory_pool=None): 2277 """ 2278 Construct StructArray from collection of arrays representing 2279 each field in the struct. 2280 2281 Either field names or field instances must be passed. 2282 2283 Parameters 2284 ---------- 2285 arrays : sequence of Array 2286 names : List[str] (optional) 2287 Field names for each struct child. 2288 fields : List[Field] (optional) 2289 Field instances for each struct child. 2290 mask : pyarrow.Array[bool] (optional) 2291 Indicate which values are null (True) or not null (False). 2292 memory_pool : MemoryPool (optional) 2293 For memory allocations, if required, otherwise uses default pool. 2294 2295 Returns 2296 ------- 2297 result : StructArray 2298 """ 2299 cdef: 2300 shared_ptr[CArray] c_array 2301 shared_ptr[CBuffer] c_mask 2302 vector[shared_ptr[CArray]] c_arrays 2303 vector[c_string] c_names 2304 vector[shared_ptr[CField]] c_fields 2305 CResult[shared_ptr[CArray]] c_result 2306 ssize_t num_arrays 2307 ssize_t length 2308 ssize_t i 2309 Field py_field 2310 DataType struct_type 2311 2312 if names is None and fields is None: 2313 raise ValueError('Must pass either names or fields') 2314 if names is not None and fields is not None: 2315 raise ValueError('Must pass either names or fields, not both') 2316 2317 if mask is None: 2318 c_mask = shared_ptr[CBuffer]() 2319 elif isinstance(mask, Array): 2320 if mask.type.id != Type_BOOL: 2321 raise ValueError('Mask must be a pyarrow.Array of type bool') 2322 if mask.null_count != 0: 2323 raise ValueError('Mask must not contain nulls') 2324 inverted_mask = _pc().invert(mask, memory_pool=memory_pool) 2325 c_mask = pyarrow_unwrap_buffer(inverted_mask.buffers()[1]) 2326 else: 2327 raise ValueError('Mask must be a pyarrow.Array of type bool') 2328 2329 arrays = [asarray(x) for x in arrays] 2330 for arr in arrays: 2331 c_array = pyarrow_unwrap_array(arr) 2332 if c_array == nullptr: 2333 raise TypeError(f"Expected Array, got {arr.__class__}") 2334 c_arrays.push_back(c_array) 2335 if names is not None: 2336 for name in names: 2337 c_names.push_back(tobytes(name)) 2338 else: 2339 for item in fields: 2340 if isinstance(item, tuple): 2341 py_field = field(*item) 2342 else: 2343 py_field = item 2344 c_fields.push_back(py_field.sp_field) 2345 2346 if (c_arrays.size() == 0 and c_names.size() == 0 and 2347 c_fields.size() == 0): 2348 # The C++ side doesn't allow this 2349 return array([], struct([])) 2350 2351 if names is not None: 2352 # XXX Cannot pass "nullptr" for a shared_ptr<T> argument: 2353 # https://github.com/cython/cython/issues/3020 2354 c_result = CStructArray.MakeFromFieldNames( 2355 c_arrays, c_names, c_mask, -1, 0) 2356 else: 2357 c_result = CStructArray.MakeFromFields( 2358 c_arrays, c_fields, c_mask, -1, 0) 2359 cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) 2360 result.validate() 2361 return result 2362 2363 2364cdef class ExtensionArray(Array): 2365 """ 2366 Concrete class for Arrow extension arrays. 2367 """ 2368 2369 @property 2370 def storage(self): 2371 cdef: 2372 CExtensionArray* ext_array = <CExtensionArray*>(self.ap) 2373 2374 return pyarrow_wrap_array(ext_array.storage()) 2375 2376 @staticmethod 2377 def from_storage(BaseExtensionType typ, Array storage): 2378 """ 2379 Construct ExtensionArray from type and storage array. 2380 2381 Parameters 2382 ---------- 2383 typ : DataType 2384 The extension type for the result array. 2385 storage : Array 2386 The underlying storage for the result array. 2387 2388 Returns 2389 ------- 2390 ext_array : ExtensionArray 2391 """ 2392 cdef: 2393 shared_ptr[CExtensionArray] ext_array 2394 2395 if storage.type != typ.storage_type: 2396 raise TypeError("Incompatible storage type {0} " 2397 "for extension type {1}".format(storage.type, typ)) 2398 2399 ext_array = make_shared[CExtensionArray](typ.sp_type, storage.sp_array) 2400 cdef Array result = pyarrow_wrap_array(<shared_ptr[CArray]> ext_array) 2401 result.validate() 2402 return result 2403 2404 def _to_pandas(self, options, **kwargs): 2405 pandas_dtype = None 2406 try: 2407 pandas_dtype = self.type.to_pandas_dtype() 2408 except NotImplementedError: 2409 pass 2410 2411 # pandas ExtensionDtype that implements conversion from pyarrow 2412 if hasattr(pandas_dtype, '__from_arrow__'): 2413 arr = pandas_dtype.__from_arrow__(self) 2414 return pandas_api.series(arr) 2415 2416 # otherwise convert the storage array with the base implementation 2417 return Array._to_pandas(self.storage, options, **kwargs) 2418 2419 def to_numpy(self, **kwargs): 2420 """ 2421 Convert extension array to a numpy ndarray. 2422 2423 See Also 2424 -------- 2425 Array.to_numpy 2426 """ 2427 return self.storage.to_numpy(**kwargs) 2428 2429 2430cdef dict _array_classes = { 2431 _Type_NA: NullArray, 2432 _Type_BOOL: BooleanArray, 2433 _Type_UINT8: UInt8Array, 2434 _Type_UINT16: UInt16Array, 2435 _Type_UINT32: UInt32Array, 2436 _Type_UINT64: UInt64Array, 2437 _Type_INT8: Int8Array, 2438 _Type_INT16: Int16Array, 2439 _Type_INT32: Int32Array, 2440 _Type_INT64: Int64Array, 2441 _Type_DATE32: Date32Array, 2442 _Type_DATE64: Date64Array, 2443 _Type_TIMESTAMP: TimestampArray, 2444 _Type_TIME32: Time32Array, 2445 _Type_TIME64: Time64Array, 2446 _Type_DURATION: DurationArray, 2447 _Type_INTERVAL_MONTH_DAY_NANO: MonthDayNanoIntervalArray, 2448 _Type_HALF_FLOAT: HalfFloatArray, 2449 _Type_FLOAT: FloatArray, 2450 _Type_DOUBLE: DoubleArray, 2451 _Type_LIST: ListArray, 2452 _Type_LARGE_LIST: LargeListArray, 2453 _Type_MAP: MapArray, 2454 _Type_FIXED_SIZE_LIST: FixedSizeListArray, 2455 _Type_SPARSE_UNION: UnionArray, 2456 _Type_DENSE_UNION: UnionArray, 2457 _Type_BINARY: BinaryArray, 2458 _Type_STRING: StringArray, 2459 _Type_LARGE_BINARY: LargeBinaryArray, 2460 _Type_LARGE_STRING: LargeStringArray, 2461 _Type_DICTIONARY: DictionaryArray, 2462 _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, 2463 _Type_DECIMAL128: Decimal128Array, 2464 _Type_DECIMAL256: Decimal256Array, 2465 _Type_STRUCT: StructArray, 2466 _Type_EXTENSION: ExtensionArray, 2467} 2468 2469 2470cdef object get_array_class_from_type( 2471 const shared_ptr[CDataType]& sp_data_type): 2472 cdef CDataType* data_type = sp_data_type.get() 2473 if data_type == NULL: 2474 raise ValueError('Array data type was NULL') 2475 2476 if data_type.id() == _Type_EXTENSION: 2477 py_ext_data_type = pyarrow_wrap_data_type(sp_data_type) 2478 return py_ext_data_type.__arrow_ext_class__() 2479 else: 2480 return _array_classes[data_type.id()] 2481 2482 2483cdef object get_values(object obj, bint* is_series): 2484 if pandas_api.is_series(obj) or pandas_api.is_index(obj): 2485 result = pandas_api.get_values(obj) 2486 is_series[0] = True 2487 elif isinstance(obj, np.ndarray): 2488 result = obj 2489 is_series[0] = False 2490 else: 2491 result = pandas_api.series(obj).values 2492 is_series[0] = False 2493 2494 return result 2495 2496 2497def concat_arrays(arrays, MemoryPool memory_pool=None): 2498 """ 2499 Concatenate the given arrays. 2500 2501 The contents of the input arrays are copied into the returned array. 2502 2503 Raises 2504 ------ 2505 ArrowInvalid : if not all of the arrays have the same type. 2506 2507 Parameters 2508 ---------- 2509 arrays : iterable of pyarrow.Array 2510 Arrays to concatenate, must be identically typed. 2511 memory_pool : MemoryPool, default None 2512 For memory allocations. If None, the default pool is used. 2513 """ 2514 cdef: 2515 vector[shared_ptr[CArray]] c_arrays 2516 shared_ptr[CArray] c_concatenated 2517 CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) 2518 2519 for array in arrays: 2520 if not isinstance(array, Array): 2521 raise TypeError("Iterable should contain Array objects, " 2522 "got {0} instead".format(type(array))) 2523 c_arrays.push_back(pyarrow_unwrap_array(array)) 2524 2525 with nogil: 2526 c_concatenated = GetResultValue(Concatenate(c_arrays, pool)) 2527 2528 return pyarrow_wrap_array(c_concatenated) 2529 2530 2531def _empty_array(DataType type): 2532 """ 2533 Create empty array of the given type. 2534 """ 2535 if type.id == Type_DICTIONARY: 2536 arr = DictionaryArray.from_arrays( 2537 _empty_array(type.index_type), _empty_array(type.value_type), 2538 ordered=type.ordered) 2539 else: 2540 arr = array([], type=type) 2541 return arr 2542