1# Licensed to the Apache Software Foundation (ASF) under one 2# or more contributor license agreements. See the NOTICE file 3# distributed with this work for additional information 4# regarding copyright ownership. The ASF licenses this file 5# to you under the Apache License, Version 2.0 (the 6# "License"); you may not use this file except in compliance 7# with the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, 12# software distributed under the License is distributed on an 13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14# KIND, either express or implied. See the License for the 15# specific language governing permissions and limitations 16# under the License. 17 18import datetime 19import decimal 20import hypothesis as h 21import hypothesis.strategies as st 22import itertools 23import pickle 24import pytest 25import struct 26import sys 27 28import numpy as np 29try: 30 import pickle5 31except ImportError: 32 pickle5 = None 33 34import pyarrow as pa 35import pyarrow.tests.strategies as past 36from pyarrow import compat 37 38 39def test_total_bytes_allocated(): 40 assert pa.total_allocated_bytes() == 0 41 42 43def test_getitem_NULL(): 44 arr = pa.array([1, None, 2]) 45 assert arr[1] is pa.NULL 46 47 48def test_constructor_raises(): 49 # This could happen by wrong capitalization. 
50 # ARROW-2638: prevent calling extension class constructors directly 51 with pytest.raises(TypeError): 52 pa.Array([1, 2]) 53 54 55def test_list_format(): 56 arr = pa.array([[1], None, [2, 3, None]]) 57 result = arr.to_string() 58 expected = """\ 59[ 60 [ 61 1 62 ], 63 null, 64 [ 65 2, 66 3, 67 null 68 ] 69]""" 70 assert result == expected 71 72 73def test_string_format(): 74 arr = pa.array(['', None, 'foo']) 75 result = arr.to_string() 76 expected = """\ 77[ 78 "", 79 null, 80 "foo" 81]""" 82 assert result == expected 83 84 85def test_long_array_format(): 86 arr = pa.array(range(100)) 87 result = arr.to_string(window=2) 88 expected = """\ 89[ 90 0, 91 1, 92 ... 93 98, 94 99 95]""" 96 assert result == expected 97 98 99def test_binary_format(): 100 arr = pa.array([b'\x00', b'', None, b'\x01foo', b'\x80\xff']) 101 result = arr.to_string() 102 expected = """\ 103[ 104 00, 105 , 106 null, 107 01666F6F, 108 80FF 109]""" 110 assert result == expected 111 112 113def test_to_numpy_zero_copy(): 114 arr = pa.array(range(10)) 115 116 np_arr = arr.to_numpy() 117 118 # check for zero copy (both arrays using same memory) 119 arrow_buf = arr.buffers()[1] 120 assert arrow_buf.address == np_arr.ctypes.data 121 122 arr = None 123 import gc 124 gc.collect() 125 126 # Ensure base is still valid 127 assert np_arr.base is not None 128 expected = np.arange(10) 129 np.testing.assert_array_equal(np_arr, expected) 130 131 132def test_to_numpy_unsupported_types(): 133 # ARROW-2871: Some primitive types are not yet supported in to_numpy 134 bool_arr = pa.array([True, False, True]) 135 136 with pytest.raises(ValueError): 137 bool_arr.to_numpy() 138 139 result = bool_arr.to_numpy(zero_copy_only=False) 140 expected = np.array([True, False, True]) 141 np.testing.assert_array_equal(result, expected) 142 143 null_arr = pa.array([None, None, None]) 144 145 with pytest.raises(ValueError): 146 null_arr.to_numpy() 147 148 result = null_arr.to_numpy(zero_copy_only=False) 149 expected = np.array([None, 
None, None], dtype=object) 150 np.testing.assert_array_equal(result, expected) 151 152 arr = pa.array([1, 2, None]) 153 154 with pytest.raises(ValueError, match="with 1 nulls"): 155 arr.to_numpy() 156 157 158def test_to_numpy_writable(): 159 arr = pa.array(range(10)) 160 np_arr = arr.to_numpy() 161 162 # by default not writable for zero-copy conversion 163 with pytest.raises(ValueError): 164 np_arr[0] = 10 165 166 np_arr2 = arr.to_numpy(zero_copy_only=False, writable=True) 167 np_arr2[0] = 10 168 assert arr[0].as_py() == 0 169 170 # when asking for writable, cannot do zero-copy 171 with pytest.raises(ValueError): 172 arr.to_numpy(zero_copy_only=True, writable=True) 173 174 175@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns']) 176def test_to_numpy_datetime64(unit): 177 arr = pa.array([1, 2, 3], pa.timestamp(unit)) 178 expected = np.array([1, 2, 3], dtype="datetime64[{}]".format(unit)) 179 np_arr = arr.to_numpy() 180 np.testing.assert_array_equal(np_arr, expected) 181 182 183@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns']) 184def test_to_numpy_timedelta64(unit): 185 arr = pa.array([1, 2, 3], pa.duration(unit)) 186 expected = np.array([1, 2, 3], dtype="timedelta64[{}]".format(unit)) 187 np_arr = arr.to_numpy() 188 np.testing.assert_array_equal(np_arr, expected) 189 190 191def test_to_numpy_dictionary(): 192 # ARROW-7591 193 arr = pa.array(["a", "b", "a"]).dictionary_encode() 194 expected = np.array(["a", "b", "a"], dtype=object) 195 np_arr = arr.to_numpy(zero_copy_only=False) 196 np.testing.assert_array_equal(np_arr, expected) 197 198 199@pytest.mark.pandas 200def test_to_pandas_zero_copy(): 201 import gc 202 203 arr = pa.array(range(10)) 204 205 for i in range(10): 206 series = arr.to_pandas() 207 assert sys.getrefcount(series) == 2 208 series = None # noqa 209 210 assert sys.getrefcount(arr) == 2 211 212 for i in range(10): 213 arr = pa.array(range(10)) 214 series = arr.to_pandas() 215 arr = None 216 gc.collect() 217 218 # Ensure base is still valid 
219 220 # Because of py.test's assert inspection magic, if you put getrefcount 221 # on the line being examined, it will be 1 higher than you expect 222 base_refcount = sys.getrefcount(series.values.base) 223 assert base_refcount == 2 224 series.sum() 225 226 227@pytest.mark.nopandas 228@pytest.mark.pandas 229def test_asarray(): 230 # ensure this is tested both when pandas is present or not (ARROW-6564) 231 232 arr = pa.array(range(4)) 233 234 # The iterator interface gives back an array of Int64Value's 235 np_arr = np.asarray([_ for _ in arr]) 236 assert np_arr.tolist() == [0, 1, 2, 3] 237 assert np_arr.dtype == np.dtype('O') 238 assert type(np_arr[0]) == pa.lib.Int64Value 239 240 # Calling with the arrow array gives back an array with 'int64' dtype 241 np_arr = np.asarray(arr) 242 assert np_arr.tolist() == [0, 1, 2, 3] 243 assert np_arr.dtype == np.dtype('int64') 244 245 # An optional type can be specified when calling np.asarray 246 np_arr = np.asarray(arr, dtype='str') 247 assert np_arr.tolist() == ['0', '1', '2', '3'] 248 249 # If PyArrow array has null values, numpy type will be changed as needed 250 # to support nulls. 251 arr = pa.array([0, 1, 2, None]) 252 assert arr.type == pa.int64() 253 np_arr = np.asarray(arr) 254 elements = np_arr.tolist() 255 assert elements[:3] == [0., 1., 2.] 
256 assert np.isnan(elements[3]) 257 assert np_arr.dtype == np.dtype('float64') 258 259 # DictionaryType data will be converted to dense numpy array 260 arr = pa.DictionaryArray.from_arrays( 261 pa.array([0, 1, 2, 0, 1]), pa.array(['a', 'b', 'c'])) 262 np_arr = np.asarray(arr) 263 assert np_arr.dtype == np.dtype('object') 264 assert np_arr.tolist() == ['a', 'b', 'c', 'a', 'b'] 265 266 267def test_array_getitem(): 268 arr = pa.array(range(10, 15)) 269 lst = arr.to_pylist() 270 271 for idx in range(-len(arr), len(arr)): 272 assert arr[idx].as_py() == lst[idx] 273 for idx in range(-2 * len(arr), -len(arr)): 274 with pytest.raises(IndexError): 275 arr[idx] 276 for idx in range(len(arr), 2 * len(arr)): 277 with pytest.raises(IndexError): 278 arr[idx] 279 280 281def test_array_slice(): 282 arr = pa.array(range(10)) 283 284 sliced = arr.slice(2) 285 expected = pa.array(range(2, 10)) 286 assert sliced.equals(expected) 287 288 sliced2 = arr.slice(2, 4) 289 expected2 = pa.array(range(2, 6)) 290 assert sliced2.equals(expected2) 291 292 # 0 offset 293 assert arr.slice(0).equals(arr) 294 295 # Slice past end of array 296 assert len(arr.slice(len(arr))) == 0 297 298 with pytest.raises(IndexError): 299 arr.slice(-1) 300 301 # Test slice notation 302 assert arr[2:].equals(arr.slice(2)) 303 assert arr[2:5].equals(arr.slice(2, 3)) 304 assert arr[-5:].equals(arr.slice(len(arr) - 5)) 305 306 n = len(arr) 307 for start in range(-n * 2, n * 2): 308 for stop in range(-n * 2, n * 2): 309 assert arr[start:stop].to_pylist() == arr.to_pylist()[start:stop] 310 311 312def test_array_slice_negative_step(): 313 # ARROW-2714 314 np_arr = np.arange(20) 315 arr = pa.array(np_arr) 316 chunked_arr = pa.chunked_array([arr]) 317 318 cases = [ 319 slice(None, None, -1), 320 slice(None, 6, -2), 321 slice(10, 6, -2), 322 slice(8, None, -2), 323 slice(2, 10, -2), 324 slice(10, 2, -2), 325 slice(None, None, 2), 326 slice(0, 10, 2), 327 ] 328 329 for case in cases: 330 result = arr[case] 331 expected = 
pa.array(np_arr[case]) 332 assert result.equals(expected) 333 334 result = pa.record_batch([arr], names=['f0'])[case] 335 expected = pa.record_batch([expected], names=['f0']) 336 assert result.equals(expected) 337 338 result = chunked_arr[case] 339 expected = pa.chunked_array([np_arr[case]]) 340 assert result.equals(expected) 341 342 343def test_array_diff(): 344 # ARROW-6252 345 arr1 = pa.array(['foo'], type=pa.utf8()) 346 arr2 = pa.array(['foo', 'bar', None], type=pa.utf8()) 347 arr3 = pa.array([1, 2, 3]) 348 arr4 = pa.array([[], [1], None], type=pa.list_(pa.int64())) 349 350 assert arr1.diff(arr1) == '' 351 assert arr1.diff(arr2) == ''' 352@@ -1, +1 @@ 353+"bar" 354+null 355''' 356 assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' 357 assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' 358 assert arr1.diff(arr4).strip() == ('# Array types differed: string vs ' 359 'list<item: int64>') 360 361 362def test_array_iter(): 363 arr = pa.array(range(10)) 364 365 for i, j in zip(range(10), arr): 366 assert i == j 367 368 assert isinstance(arr, compat.Iterable) 369 370 371def test_struct_array_slice(): 372 # ARROW-2311: slicing nested arrays needs special care 373 ty = pa.struct([pa.field('a', pa.int8()), 374 pa.field('b', pa.float32())]) 375 arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) 376 assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5}, 377 {'a': 5, 'b': 6.5}] 378 379 380def test_array_factory_invalid_type(): 381 382 class MyObject: 383 pass 384 385 arr = np.array([MyObject()]) 386 with pytest.raises(ValueError): 387 pa.array(arr) 388 389 390def test_array_ref_to_ndarray_base(): 391 arr = np.array([1, 2, 3]) 392 393 refcount = sys.getrefcount(arr) 394 arr2 = pa.array(arr) # noqa 395 assert sys.getrefcount(arr) == (refcount + 1) 396 397 398def test_array_from_buffers(): 399 values_buf = pa.py_buffer(np.int16([4, 5, 6, 7])) 400 nulls_buf = pa.py_buffer(np.uint8([0b00001101])) 401 arr = 
pa.Array.from_buffers(pa.int16(), 4, [nulls_buf, values_buf]) 402 assert arr.type == pa.int16() 403 assert arr.to_pylist() == [4, None, 6, 7] 404 405 arr = pa.Array.from_buffers(pa.int16(), 4, [None, values_buf]) 406 assert arr.type == pa.int16() 407 assert arr.to_pylist() == [4, 5, 6, 7] 408 409 arr = pa.Array.from_buffers(pa.int16(), 3, [nulls_buf, values_buf], 410 offset=1) 411 assert arr.type == pa.int16() 412 assert arr.to_pylist() == [None, 6, 7] 413 414 with pytest.raises(TypeError): 415 pa.Array.from_buffers(pa.int16(), 3, ['', ''], offset=1) 416 417 418def test_string_binary_from_buffers(): 419 array = pa.array(["a", None, "b", "c"]) 420 421 buffers = array.buffers() 422 copied = pa.StringArray.from_buffers( 423 len(array), buffers[1], buffers[2], buffers[0], array.null_count, 424 array.offset) 425 assert copied.to_pylist() == ["a", None, "b", "c"] 426 427 binary_copy = pa.Array.from_buffers(pa.binary(), len(array), 428 array.buffers(), array.null_count, 429 array.offset) 430 assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"] 431 432 copied = pa.StringArray.from_buffers( 433 len(array), buffers[1], buffers[2], buffers[0]) 434 assert copied.to_pylist() == ["a", None, "b", "c"] 435 436 sliced = array[1:] 437 buffers = sliced.buffers() 438 copied = pa.StringArray.from_buffers( 439 len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset) 440 assert copied.to_pylist() == [None, "b", "c"] 441 assert copied.null_count == 1 442 443 # Slice but exclude all null entries so that we don't need to pass 444 # the null bitmap. 
445 sliced = array[2:] 446 buffers = sliced.buffers() 447 copied = pa.StringArray.from_buffers( 448 len(sliced), buffers[1], buffers[2], None, -1, sliced.offset) 449 assert copied.to_pylist() == ["b", "c"] 450 assert copied.null_count == 0 451 452 453@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list]) 454def test_list_from_buffers(list_type_factory): 455 ty = list_type_factory(pa.int16()) 456 array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty) 457 assert array.type == ty 458 459 buffers = array.buffers() 460 461 with pytest.raises(ValueError): 462 # No children 463 pa.Array.from_buffers(ty, 4, [None, buffers[1]]) 464 465 child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:]) 466 copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child]) 467 assert copied.equals(array) 468 469 with pytest.raises(ValueError): 470 # too many children 471 pa.Array.from_buffers(ty, 4, [None, buffers[1]], 472 children=[child, child]) 473 474 475def test_struct_from_buffers(): 476 ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())]) 477 array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}], 478 type=ty) 479 buffers = array.buffers() 480 481 with pytest.raises(ValueError): 482 # No children 483 pa.Array.from_buffers(ty, 3, [None, buffers[1]]) 484 485 children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]), 486 pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])] 487 copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children) 488 assert copied.equals(array) 489 490 with pytest.raises(ValueError): 491 # not enough many children 492 pa.Array.from_buffers(ty, 3, [buffers[0]], 493 children=children[:1]) 494 495 496def test_struct_from_arrays(): 497 a = pa.array([4, 5, 6], type=pa.int64()) 498 b = pa.array(["bar", None, ""]) 499 c = pa.array([[1, 2], None, [3, None]]) 500 expected_list = [ 501 {'a': 4, 'b': 'bar', 'c': [1, 2]}, 502 {'a': 5, 'b': None, 'c': None}, 503 {'a': 6, 'b': '', 'c': [3, None]}, 504 ] 505 
506 # From field names 507 arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"]) 508 assert arr.type == pa.struct( 509 [("a", a.type), ("b", b.type), ("c", c.type)]) 510 assert arr.to_pylist() == expected_list 511 512 with pytest.raises(ValueError): 513 pa.StructArray.from_arrays([a, b, c], ["a", "b"]) 514 515 arr = pa.StructArray.from_arrays([], []) 516 assert arr.type == pa.struct([]) 517 assert arr.to_pylist() == [] 518 519 # From fields 520 fa = pa.field("a", a.type, nullable=False) 521 fb = pa.field("b", b.type) 522 fc = pa.field("c", c.type) 523 arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc]) 524 assert arr.type == pa.struct([fa, fb, fc]) 525 assert not arr.type[0].nullable 526 assert arr.to_pylist() == expected_list 527 528 with pytest.raises(ValueError): 529 pa.StructArray.from_arrays([a, b, c], fields=[fa, fb]) 530 531 arr = pa.StructArray.from_arrays([], fields=[]) 532 assert arr.type == pa.struct([]) 533 assert arr.to_pylist() == [] 534 535 # Inconsistent fields 536 fa2 = pa.field("a", pa.int32()) 537 with pytest.raises(ValueError, match="int64 vs int32"): 538 pa.StructArray.from_arrays([a, b, c], fields=[fa2, fb, fc]) 539 540 541def test_dictionary_from_numpy(): 542 indices = np.repeat([0, 1, 2], 2) 543 dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) 544 mask = np.array([False, False, True, False, False, False]) 545 546 d1 = pa.DictionaryArray.from_arrays(indices, dictionary) 547 d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask) 548 549 assert d1.indices.to_pylist() == indices.tolist() 550 assert d1.indices.to_pylist() == indices.tolist() 551 assert d1.dictionary.to_pylist() == dictionary.tolist() 552 assert d2.dictionary.to_pylist() == dictionary.tolist() 553 554 for i in range(len(indices)): 555 assert d1[i].as_py() == dictionary[indices[i]] 556 557 if mask[i]: 558 assert d2[i] is pa.NULL 559 else: 560 assert d2[i].as_py() == dictionary[indices[i]] 561 562 563def test_dictionary_from_boxed_arrays(): 
564 indices = np.repeat([0, 1, 2], 2) 565 dictionary = np.array(['foo', 'bar', 'baz'], dtype=object) 566 567 iarr = pa.array(indices) 568 darr = pa.array(dictionary) 569 570 d1 = pa.DictionaryArray.from_arrays(iarr, darr) 571 572 assert d1.indices.to_pylist() == indices.tolist() 573 assert d1.dictionary.to_pylist() == dictionary.tolist() 574 575 for i in range(len(indices)): 576 assert d1[i].as_py() == dictionary[indices[i]] 577 578 579def test_dictionary_from_arrays_boundscheck(): 580 indices1 = pa.array([0, 1, 2, 0, 1, 2]) 581 indices2 = pa.array([0, -1, 2]) 582 indices3 = pa.array([0, 1, 2, 3]) 583 584 dictionary = pa.array(['foo', 'bar', 'baz']) 585 586 # Works fine 587 pa.DictionaryArray.from_arrays(indices1, dictionary) 588 589 with pytest.raises(pa.ArrowException): 590 pa.DictionaryArray.from_arrays(indices2, dictionary) 591 592 with pytest.raises(pa.ArrowException): 593 pa.DictionaryArray.from_arrays(indices3, dictionary) 594 595 # If we are confident that the indices are "safe" we can pass safe=False to 596 # disable the boundschecking 597 pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False) 598 599 600def test_dictionary_indices(): 601 # https://issues.apache.org/jira/browse/ARROW-6882 602 indices = pa.array([0, 1, 2, 0, 1, 2]) 603 dictionary = pa.array(['foo', 'bar', 'baz']) 604 arr = pa.DictionaryArray.from_arrays(indices, dictionary) 605 arr.indices.validate(full=True) 606 607 608@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), 609 [(pa.ListArray, pa.list_), 610 (pa.LargeListArray, pa.large_list)]) 611def test_list_from_arrays(list_array_type, list_type_factory): 612 offsets_arr = np.array([0, 2, 5, 8], dtype='i4') 613 offsets = pa.array(offsets_arr, type='int32') 614 pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] 615 values = pa.array(pyvalues, type='binary') 616 617 result = list_array_type.from_arrays(offsets, values) 618 expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]], 619 
type=list_type_factory(pa.binary())) 620 621 assert result.equals(expected) 622 623 # With nulls 624 offsets = [0, None, 2, 6] 625 values = [b'a', b'b', b'c', b'd', b'e', b'f'] 626 627 result = list_array_type.from_arrays(offsets, values) 628 expected = pa.array([values[:2], None, values[2:]], 629 type=list_type_factory(pa.binary())) 630 631 assert result.equals(expected) 632 633 # Another edge case 634 offsets2 = [0, 2, None, 6] 635 result = list_array_type.from_arrays(offsets2, values) 636 expected = pa.array([values[:2], values[2:], None], 637 type=list_type_factory(pa.binary())) 638 assert result.equals(expected) 639 640 # raise on invalid array 641 offsets = [1, 3, 10] 642 values = np.arange(5) 643 with pytest.raises(ValueError): 644 list_array_type.from_arrays(offsets, values) 645 646 # Non-monotonic offsets 647 offsets = [0, 3, 2, 6] 648 values = list(range(6)) 649 result = list_array_type.from_arrays(offsets, values) 650 with pytest.raises(ValueError): 651 result.validate(full=True) 652 653 654def test_map_from_arrays(): 655 offsets_arr = np.array([0, 2, 5, 8], dtype='i4') 656 offsets = pa.array(offsets_arr, type='int32') 657 pykeys = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] 658 pyitems = list(range(len(pykeys))) 659 pypairs = list(zip(pykeys, pyitems)) 660 pyentries = [pypairs[:2], pypairs[2:5], pypairs[5:8]] 661 keys = pa.array(pykeys, type='binary') 662 items = pa.array(pyitems, type='i4') 663 664 result = pa.MapArray.from_arrays(offsets, keys, items) 665 expected = pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32())) 666 667 assert result.equals(expected) 668 669 # With nulls 670 offsets = [0, None, 2, 6] 671 pykeys = [b'a', b'b', b'c', b'd', b'e', b'f'] 672 pyitems = [1, 2, 3, None, 4, 5] 673 pypairs = list(zip(pykeys, pyitems)) 674 pyentries = [pypairs[:2], None, pypairs[2:]] 675 keys = pa.array(pykeys, type='binary') 676 items = pa.array(pyitems, type='i4') 677 678 result = pa.MapArray.from_arrays(offsets, keys, items) 679 expected = 
pa.array(pyentries, type=pa.map_(pa.binary(), pa.int32())) 680 681 assert result.equals(expected) 682 683 # check invalid usage 684 685 offsets = [0, 1, 3, 5] 686 keys = np.arange(5) 687 items = np.arange(5) 688 _ = pa.MapArray.from_arrays(offsets, keys, items) 689 690 # raise on invalid offsets 691 with pytest.raises(ValueError): 692 pa.MapArray.from_arrays(offsets + [6], keys, items) 693 694 # raise on length of keys != items 695 with pytest.raises(ValueError): 696 pa.MapArray.from_arrays(offsets, keys, np.concatenate([items, items])) 697 698 # raise on keys with null 699 keys_with_null = list(keys)[:-1] + [None] 700 assert len(keys_with_null) == len(items) 701 with pytest.raises(ValueError): 702 pa.MapArray.from_arrays(offsets, keys_with_null, items) 703 704 705def test_fixed_size_list_from_arrays(): 706 values = pa.array(range(12), pa.int64()) 707 result = pa.FixedSizeListArray.from_arrays(values, 4) 708 assert result.to_pylist() == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]] 709 assert result.type.equals(pa.list_(pa.int64(), 4)) 710 711 # raise on invalid values / list_size 712 with pytest.raises(ValueError): 713 pa.FixedSizeListArray.from_arrays(values, -4) 714 715 with pytest.raises(ValueError): 716 # array with list size 0 cannot be constructed with from_arrays 717 pa.FixedSizeListArray.from_arrays(pa.array([], pa.int64()), 0) 718 719 with pytest.raises(ValueError): 720 # length of values not multiple of 5 721 pa.FixedSizeListArray.from_arrays(values, 5) 722 723 724def test_union_from_dense(): 725 binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') 726 int64 = pa.array([1, 2, 3], type='int64') 727 types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') 728 logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8') 729 value_offsets = pa.array([1, 0, 0, 2, 1, 2, 3], type='int32') 730 py_value = [b'b', 1, b'a', b'c', 2, 3, b'd'] 731 732 def check_result(result, expected_field_names, expected_type_codes, 733 expected_type_code_values): 734 
result.validate(full=True) 735 actual_field_names = [result.type[i].name 736 for i in range(result.type.num_children)] 737 assert actual_field_names == expected_field_names 738 assert result.type.mode == "dense" 739 assert result.type.type_codes == expected_type_codes 740 assert result.to_pylist() == py_value 741 assert expected_type_code_values.equals(result.type_codes) 742 assert value_offsets.equals(result.offsets) 743 assert result.child(0).equals(binary) 744 assert result.child(1).equals(int64) 745 with pytest.raises(KeyError): 746 result.child(-1) 747 with pytest.raises(KeyError): 748 result.child(2) 749 750 # without field names and type codes 751 check_result(pa.UnionArray.from_dense(types, value_offsets, 752 [binary, int64]), 753 expected_field_names=['0', '1'], 754 expected_type_codes=[0, 1], 755 expected_type_code_values=types) 756 757 # with field names 758 check_result(pa.UnionArray.from_dense(types, value_offsets, 759 [binary, int64], 760 ['bin', 'int']), 761 expected_field_names=['bin', 'int'], 762 expected_type_codes=[0, 1], 763 expected_type_code_values=types) 764 765 # with type codes 766 check_result(pa.UnionArray.from_dense(logical_types, value_offsets, 767 [binary, int64], 768 type_codes=[11, 13]), 769 expected_field_names=['0', '1'], 770 expected_type_codes=[11, 13], 771 expected_type_code_values=logical_types) 772 773 # with field names and type codes 774 check_result(pa.UnionArray.from_dense(logical_types, value_offsets, 775 [binary, int64], 776 ['bin', 'int'], [11, 13]), 777 expected_field_names=['bin', 'int'], 778 expected_type_codes=[11, 13], 779 expected_type_code_values=logical_types) 780 781 # Bad type ids 782 arr = pa.UnionArray.from_dense(logical_types, value_offsets, 783 [binary, int64]) 784 with pytest.raises(pa.ArrowInvalid): 785 arr.validate(full=True) 786 arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64], 787 type_codes=[11, 13]) 788 with pytest.raises(pa.ArrowInvalid): 789 arr.validate(full=True) 790 791 # 
Offset larger than child size 792 bad_offsets = pa.array([0, 0, 1, 2, 1, 2, 4], type='int32') 793 arr = pa.UnionArray.from_dense(types, bad_offsets, [binary, int64]) 794 with pytest.raises(pa.ArrowInvalid): 795 arr.validate(full=True) 796 797 798def test_union_from_sparse(): 799 binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'], 800 type='binary') 801 int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64') 802 types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') 803 logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8') 804 py_value = [b'a', 1, b'b', b'c', 2, 3, b'd'] 805 806 def check_result(result, expected_field_names, expected_type_codes, 807 expected_type_code_values): 808 result.validate(full=True) 809 assert result.to_pylist() == py_value 810 actual_field_names = [result.type[i].name 811 for i in range(result.type.num_children)] 812 assert actual_field_names == expected_field_names 813 assert result.type.mode == "sparse" 814 assert result.type.type_codes == expected_type_codes 815 assert expected_type_code_values.equals(result.type_codes) 816 assert result.child(0).equals(binary) 817 assert result.child(1).equals(int64) 818 with pytest.raises(pa.ArrowTypeError): 819 result.offsets 820 with pytest.raises(KeyError): 821 result.child(-1) 822 with pytest.raises(KeyError): 823 result.child(2) 824 825 # without field names and type codes 826 check_result(pa.UnionArray.from_sparse(types, [binary, int64]), 827 expected_field_names=['0', '1'], 828 expected_type_codes=[0, 1], 829 expected_type_code_values=types) 830 831 # with field names 832 check_result(pa.UnionArray.from_sparse(types, [binary, int64], 833 ['bin', 'int']), 834 expected_field_names=['bin', 'int'], 835 expected_type_codes=[0, 1], 836 expected_type_code_values=types) 837 838 # with type codes 839 check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64], 840 type_codes=[11, 13]), 841 expected_field_names=['0', '1'], 842 expected_type_codes=[11, 13], 843 
expected_type_code_values=logical_types) 844 845 # with field names and type codes 846 check_result(pa.UnionArray.from_sparse(logical_types, [binary, int64], 847 ['bin', 'int'], 848 [11, 13]), 849 expected_field_names=['bin', 'int'], 850 expected_type_codes=[11, 13], 851 expected_type_code_values=logical_types) 852 853 # Bad type ids 854 arr = pa.UnionArray.from_sparse(logical_types, [binary, int64]) 855 with pytest.raises(pa.ArrowInvalid): 856 arr.validate(full=True) 857 arr = pa.UnionArray.from_sparse(types, [binary, int64], 858 type_codes=[11, 13]) 859 with pytest.raises(pa.ArrowInvalid): 860 arr.validate(full=True) 861 862 # Invalid child length 863 with pytest.raises(pa.ArrowInvalid): 864 arr = pa.UnionArray.from_sparse(logical_types, [binary, int64[1:]]) 865 866 867def test_union_array_slice(): 868 # ARROW-2314 869 arr = pa.UnionArray.from_sparse(pa.array([0, 0, 1, 1], type=pa.int8()), 870 [pa.array(["a", "b", "c", "d"]), 871 pa.array([1, 2, 3, 4])]) 872 assert arr[1:].to_pylist() == ["b", 3, 4] 873 874 binary = pa.array([b'a', b'b', b'c', b'd'], type='binary') 875 int64 = pa.array([1, 2, 3], type='int64') 876 types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8') 877 value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32') 878 879 arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64]) 880 lst = arr.to_pylist() 881 for i in range(len(arr)): 882 for j in range(i, len(arr)): 883 assert arr[i:j].to_pylist() == lst[i:j] 884 885 886def _check_cast_case(case, *, safe=True, check_array_construction=True): 887 in_data, in_type, out_data, out_type = case 888 if isinstance(out_data, pa.Array): 889 assert out_data.type == out_type 890 expected = out_data 891 else: 892 expected = pa.array(out_data, type=out_type) 893 894 # check casting an already created array 895 if isinstance(in_data, pa.Array): 896 assert in_data.type == in_type 897 in_arr = in_data 898 else: 899 in_arr = pa.array(in_data, type=in_type) 900 casted = in_arr.cast(out_type, safe=safe) 
901 casted.validate(full=True) 902 assert casted.equals(expected) 903 904 # constructing an array with out type which optionally involves casting 905 # for more see ARROW-1949 906 if check_array_construction: 907 in_arr = pa.array(in_data, type=out_type, safe=safe) 908 assert in_arr.equals(expected) 909 910 911def test_cast_integers_safe(): 912 safe_cases = [ 913 (np.array([0, 1, 2, 3], dtype='i1'), 'int8', 914 np.array([0, 1, 2, 3], dtype='i4'), pa.int32()), 915 (np.array([0, 1, 2, 3], dtype='i1'), 'int8', 916 np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()), 917 (np.array([0, 1, 2, 3], dtype='i1'), 'int8', 918 np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()), 919 (np.array([0, 1, 2, 3], dtype='i1'), 'int8', 920 np.array([0, 1, 2, 3], dtype='f8'), pa.float64()) 921 ] 922 923 for case in safe_cases: 924 _check_cast_case(case) 925 926 unsafe_cases = [ 927 (np.array([50000], dtype='i4'), 'int32', 'int16'), 928 (np.array([70000], dtype='i4'), 'int32', 'uint16'), 929 (np.array([-1], dtype='i4'), 'int32', 'uint16'), 930 (np.array([50000], dtype='u2'), 'uint16', 'int16') 931 ] 932 for in_data, in_type, out_type in unsafe_cases: 933 in_arr = pa.array(in_data, type=in_type) 934 935 with pytest.raises(pa.ArrowInvalid): 936 in_arr.cast(out_type) 937 938 939def test_cast_none(): 940 # ARROW-3735: Ensure that calling cast(None) doesn't segfault. 
941 arr = pa.array([1, 2, 3]) 942 943 with pytest.raises(ValueError): 944 arr.cast(None) 945 946 947def test_cast_list_to_primitive(): 948 # ARROW-8070: cast segfaults on unsupported cast from list<binary> to utf8 949 arr = pa.array([[1, 2], [3, 4]]) 950 with pytest.raises(NotImplementedError): 951 arr.cast(pa.int8()) 952 953 arr = pa.array([[b"a", b"b"], [b"c"]], pa.list_(pa.binary())) 954 with pytest.raises(NotImplementedError): 955 arr.cast(pa.binary()) 956 957 958def test_slice_chunked_array_zero_chunks(): 959 # ARROW-8911 960 arr = pa.chunked_array([], type='int8') 961 assert arr.num_chunks == 0 962 963 result = arr[:] 964 assert result.equals(arr) 965 966 # Do not crash 967 arr[:5] 968 969 970def test_cast_chunked_array(): 971 arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])] 972 carr = pa.chunked_array(arrays) 973 974 target = pa.float64() 975 casted = carr.cast(target) 976 expected = pa.chunked_array([x.cast(target) for x in arrays]) 977 assert casted.equals(expected) 978 979 980def test_cast_chunked_array_empty(): 981 # ARROW-8142 982 for typ1, typ2 in [(pa.dictionary(pa.int8(), pa.string()), pa.string()), 983 (pa.int64(), pa.int32())]: 984 985 arr = pa.chunked_array([], type=typ1) 986 result = arr.cast(typ2) 987 expected = pa.chunked_array([], type=typ2) 988 assert result.equals(expected) 989 990 991def test_chunked_array_data_warns(): 992 with pytest.warns(FutureWarning): 993 res = pa.chunked_array([[]]).data 994 assert isinstance(res, pa.ChunkedArray) 995 996 997def test_cast_integers_unsafe(): 998 # We let NumPy do the unsafe casting 999 unsafe_cases = [ 1000 (np.array([50000], dtype='i4'), 'int32', 1001 np.array([50000], dtype='i2'), pa.int16()), 1002 (np.array([70000], dtype='i4'), 'int32', 1003 np.array([70000], dtype='u2'), pa.uint16()), 1004 (np.array([-1], dtype='i4'), 'int32', 1005 np.array([-1], dtype='u2'), pa.uint16()), 1006 (np.array([50000], dtype='u2'), pa.uint16(), 1007 np.array([50000], dtype='i2'), pa.int16()) 1008 ] 1009 1010 for 
case in unsafe_cases: 1011 _check_cast_case(case, safe=False) 1012 1013 1014def test_floating_point_truncate_safe(): 1015 safe_cases = [ 1016 (np.array([1.0, 2.0, 3.0], dtype='float32'), 'float32', 1017 np.array([1, 2, 3], dtype='i4'), pa.int32()), 1018 (np.array([1.0, 2.0, 3.0], dtype='float64'), 'float64', 1019 np.array([1, 2, 3], dtype='i4'), pa.int32()), 1020 (np.array([-10.0, 20.0, -30.0], dtype='float64'), 'float64', 1021 np.array([-10, 20, -30], dtype='i4'), pa.int32()), 1022 ] 1023 for case in safe_cases: 1024 _check_cast_case(case, safe=True) 1025 1026 1027def test_floating_point_truncate_unsafe(): 1028 unsafe_cases = [ 1029 (np.array([1.1, 2.2, 3.3], dtype='float32'), 'float32', 1030 np.array([1, 2, 3], dtype='i4'), pa.int32()), 1031 (np.array([1.1, 2.2, 3.3], dtype='float64'), 'float64', 1032 np.array([1, 2, 3], dtype='i4'), pa.int32()), 1033 (np.array([-10.1, 20.2, -30.3], dtype='float64'), 'float64', 1034 np.array([-10, 20, -30], dtype='i4'), pa.int32()), 1035 ] 1036 for case in unsafe_cases: 1037 # test safe casting raises 1038 with pytest.raises(pa.ArrowInvalid, 1039 match='Floating point value truncated'): 1040 _check_cast_case(case, safe=True) 1041 1042 # test unsafe casting truncates 1043 _check_cast_case(case, safe=False) 1044 1045 1046def test_decimal_to_int_safe(): 1047 safe_cases = [ 1048 ( 1049 [decimal.Decimal("123456"), None, decimal.Decimal("-912345")], 1050 pa.decimal128(32, 5), 1051 [123456, None, -912345], 1052 pa.int32() 1053 ), 1054 ( 1055 [decimal.Decimal("1234"), None, decimal.Decimal("-9123")], 1056 pa.decimal128(19, 10), 1057 [1234, None, -9123], 1058 pa.int16() 1059 ), 1060 ( 1061 [decimal.Decimal("123"), None, decimal.Decimal("-91")], 1062 pa.decimal128(19, 10), 1063 [123, None, -91], 1064 pa.int8() 1065 ), 1066 ] 1067 for case in safe_cases: 1068 _check_cast_case(case) 1069 _check_cast_case(case, safe=True) 1070 1071 1072def test_decimal_to_int_value_out_of_bounds(): 1073 out_of_bounds_cases = [ 1074 ( 1075 np.array([ 1076 
decimal.Decimal("1234567890123"), 1077 None, 1078 decimal.Decimal("-912345678901234") 1079 ]), 1080 pa.decimal128(32, 5), 1081 [1912276171, None, -135950322], 1082 pa.int32() 1083 ), 1084 ( 1085 [decimal.Decimal("123456"), None, decimal.Decimal("-912345678")], 1086 pa.decimal128(32, 5), 1087 [-7616, None, -19022], 1088 pa.int16() 1089 ), 1090 ( 1091 [decimal.Decimal("1234"), None, decimal.Decimal("-9123")], 1092 pa.decimal128(32, 5), 1093 [-46, None, 93], 1094 pa.int8() 1095 ), 1096 ] 1097 1098 for case in out_of_bounds_cases: 1099 # test safe casting raises 1100 with pytest.raises(pa.ArrowInvalid, 1101 match='Integer value out of bounds'): 1102 _check_cast_case(case) 1103 1104 # XXX `safe=False` can be ignored when constructing an array 1105 # from a sequence of Python objects (ARROW-8567) 1106 _check_cast_case(case, safe=False, check_array_construction=False) 1107 1108 1109def test_decimal_to_int_non_integer(): 1110 non_integer_cases = [ 1111 ( 1112 [ 1113 decimal.Decimal("123456.21"), 1114 None, 1115 decimal.Decimal("-912345.13") 1116 ], 1117 pa.decimal128(32, 5), 1118 [123456, None, -912345], 1119 pa.int32() 1120 ), 1121 ( 1122 [decimal.Decimal("1234.134"), None, decimal.Decimal("-9123.1")], 1123 pa.decimal128(19, 10), 1124 [1234, None, -9123], 1125 pa.int16() 1126 ), 1127 ( 1128 [decimal.Decimal("123.1451"), None, decimal.Decimal("-91.21")], 1129 pa.decimal128(19, 10), 1130 [123, None, -91], 1131 pa.int8() 1132 ), 1133 ] 1134 1135 for case in non_integer_cases: 1136 # test safe casting raises 1137 msg_regexp = 'Rescaling decimal value would cause data loss' 1138 with pytest.raises(pa.ArrowInvalid, match=msg_regexp): 1139 _check_cast_case(case) 1140 1141 _check_cast_case(case, safe=False) 1142 1143 1144def test_decimal_to_decimal(): 1145 arr = pa.array( 1146 [decimal.Decimal("1234.12"), None], 1147 type=pa.decimal128(19, 10) 1148 ) 1149 result = arr.cast(pa.decimal128(15, 6)) 1150 expected = pa.array( 1151 [decimal.Decimal("1234.12"), None], 1152 
type=pa.decimal128(15, 6) 1153 ) 1154 assert result.equals(expected) 1155 1156 with pytest.raises(pa.ArrowInvalid, 1157 match='Rescaling decimal value would cause data loss'): 1158 result = arr.cast(pa.decimal128(9, 1)) 1159 1160 result = arr.cast(pa.decimal128(9, 1), safe=False) 1161 expected = pa.array( 1162 [decimal.Decimal("1234.1"), None], 1163 type=pa.decimal128(9, 1) 1164 ) 1165 assert result.equals(expected) 1166 1167 # TODO FIXME 1168 # this should fail but decimal overflow is not implemented 1169 result = arr.cast(pa.decimal128(1, 2)) 1170 1171 1172def test_safe_cast_nan_to_int_raises(): 1173 arr = pa.array([np.nan, 1.]) 1174 1175 with pytest.raises(pa.ArrowInvalid, 1176 match='Floating point value truncated'): 1177 arr.cast(pa.int64(), safe=True) 1178 1179 1180def test_cast_signed_to_unsigned(): 1181 safe_cases = [ 1182 (np.array([0, 1, 2, 3], dtype='i1'), pa.uint8(), 1183 np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()), 1184 (np.array([0, 1, 2, 3], dtype='i2'), pa.uint16(), 1185 np.array([0, 1, 2, 3], dtype='u2'), pa.uint16()) 1186 ] 1187 1188 for case in safe_cases: 1189 _check_cast_case(case) 1190 1191 1192def test_cast_from_null(): 1193 in_data = [None] * 3 1194 in_type = pa.null() 1195 out_types = [ 1196 pa.null(), 1197 pa.uint8(), 1198 pa.float16(), 1199 pa.utf8(), 1200 pa.binary(), 1201 pa.binary(10), 1202 pa.list_(pa.int16()), 1203 pa.list_(pa.int32(), 4), 1204 pa.large_list(pa.uint8()), 1205 pa.decimal128(19, 4), 1206 pa.timestamp('us'), 1207 pa.timestamp('us', tz='UTC'), 1208 pa.timestamp('us', tz='Europe/Paris'), 1209 pa.duration('us'), 1210 pa.struct([pa.field('a', pa.int32()), 1211 pa.field('b', pa.list_(pa.int8())), 1212 pa.field('c', pa.string())]), 1213 ] 1214 for out_type in out_types: 1215 _check_cast_case((in_data, in_type, in_data, out_type)) 1216 1217 out_types = [ 1218 pa.dictionary(pa.int32(), pa.string()), 1219 pa.union([pa.field('a', pa.binary(10)), 1220 pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), 1221 
pa.union([pa.field('a', pa.binary(10)), 1222 pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), 1223 ] 1224 in_arr = pa.array(in_data, type=pa.null()) 1225 for out_type in out_types: 1226 with pytest.raises(NotImplementedError): 1227 in_arr.cast(out_type) 1228 1229 1230def test_cast_string_to_number_roundtrip(): 1231 cases = [ 1232 (pa.array(["1", "127", "-128"]), 1233 pa.array([1, 127, -128], type=pa.int8())), 1234 (pa.array([None, "18446744073709551615"]), 1235 pa.array([None, 18446744073709551615], type=pa.uint64())), 1236 ] 1237 for in_arr, expected in cases: 1238 casted = in_arr.cast(expected.type, safe=True) 1239 casted.validate(full=True) 1240 assert casted.equals(expected) 1241 casted_back = casted.cast(in_arr.type, safe=True) 1242 casted_back.validate(full=True) 1243 assert casted_back.equals(in_arr) 1244 1245 1246def test_cast_dictionary(): 1247 arr = pa.DictionaryArray.from_arrays( 1248 pa.array([0, 1, None], type=pa.int32()), 1249 pa.array(["foo", "bar"])) 1250 assert arr.cast(pa.string()).equals(pa.array(["foo", "bar", None])) 1251 with pytest.raises(pa.ArrowInvalid): 1252 # Shouldn't crash (ARROW-7077) 1253 arr.cast(pa.int32()) 1254 1255 1256def test_view(): 1257 # ARROW-5992 1258 arr = pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) 1259 expected = pa.array(['foo', 'bar', 'baz'], type=pa.binary()) 1260 1261 assert arr.view(pa.binary()).equals(expected) 1262 assert arr.view('binary').equals(expected) 1263 1264 1265def test_unique_simple(): 1266 cases = [ 1267 (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])), 1268 (pa.array(['foo', None, 'bar', 'foo']), 1269 pa.array(['foo', None, 'bar'])), 1270 (pa.array(['foo', None, 'bar', 'foo'], pa.large_binary()), 1271 pa.array(['foo', None, 'bar'], pa.large_binary())), 1272 ] 1273 for arr, expected in cases: 1274 result = arr.unique() 1275 assert result.equals(expected) 1276 result = pa.chunked_array([arr]).unique() 1277 assert result.equals(expected) 1278 1279 1280def test_value_counts_simple(): 
1281 cases = [ 1282 (pa.array([1, 2, 3, 1, 2, 3]), 1283 pa.array([1, 2, 3]), 1284 pa.array([2, 2, 2], type=pa.int64())), 1285 (pa.array(['foo', None, 'bar', 'foo']), 1286 pa.array(['foo', None, 'bar']), 1287 pa.array([2, 1, 1], type=pa.int64())), 1288 (pa.array(['foo', None, 'bar', 'foo'], pa.large_binary()), 1289 pa.array(['foo', None, 'bar'], pa.large_binary()), 1290 pa.array([2, 1, 1], type=pa.int64())), 1291 ] 1292 for arr, expected_values, expected_counts in cases: 1293 for arr_in in (arr, pa.chunked_array([arr])): 1294 result = arr_in.value_counts() 1295 assert result.type.equals( 1296 pa.struct([pa.field("values", arr.type), 1297 pa.field("counts", pa.int64())])) 1298 assert result.field("values").equals(expected_values) 1299 assert result.field("counts").equals(expected_counts) 1300 1301 1302def test_dictionary_encode_simple(): 1303 cases = [ 1304 (pa.array([1, 2, 3, None, 1, 2, 3]), 1305 pa.DictionaryArray.from_arrays( 1306 pa.array([0, 1, 2, None, 0, 1, 2], type='int32'), 1307 [1, 2, 3])), 1308 (pa.array(['foo', None, 'bar', 'foo']), 1309 pa.DictionaryArray.from_arrays( 1310 pa.array([0, None, 1, 0], type='int32'), 1311 ['foo', 'bar'])), 1312 (pa.array(['foo', None, 'bar', 'foo'], type=pa.large_binary()), 1313 pa.DictionaryArray.from_arrays( 1314 pa.array([0, None, 1, 0], type='int32'), 1315 pa.array(['foo', 'bar'], type=pa.large_binary()))), 1316 ] 1317 for arr, expected in cases: 1318 result = arr.dictionary_encode() 1319 assert result.equals(expected) 1320 result = pa.chunked_array([arr]).dictionary_encode() 1321 assert result.num_chunks == 1 1322 assert result.chunk(0).equals(expected) 1323 result = pa.chunked_array([], type=arr.type).dictionary_encode() 1324 assert result.num_chunks == 0 1325 assert result.type == expected.type 1326 1327 1328def test_dictionary_encode_sliced(): 1329 cases = [ 1330 (pa.array([1, 2, 3, None, 1, 2, 3])[1:-1], 1331 pa.DictionaryArray.from_arrays( 1332 pa.array([0, 1, None, 2, 0], type='int32'), 1333 [2, 3, 1])), 1334 
(pa.array([None, 'foo', 'bar', 'foo', 'xyzzy'])[1:-1], 1335 pa.DictionaryArray.from_arrays( 1336 pa.array([0, 1, 0], type='int32'), 1337 ['foo', 'bar'])), 1338 (pa.array([None, 'foo', 'bar', 'foo', 'xyzzy'], 1339 type=pa.large_string())[1:-1], 1340 pa.DictionaryArray.from_arrays( 1341 pa.array([0, 1, 0], type='int32'), 1342 pa.array(['foo', 'bar'], type=pa.large_string()))), 1343 ] 1344 for arr, expected in cases: 1345 result = arr.dictionary_encode() 1346 assert result.equals(expected) 1347 result = pa.chunked_array([arr]).dictionary_encode() 1348 assert result.num_chunks == 1 1349 assert result.type == expected.type 1350 assert result.chunk(0).equals(expected) 1351 result = pa.chunked_array([], type=arr.type).dictionary_encode() 1352 assert result.num_chunks == 0 1353 assert result.type == expected.type 1354 1355 1356def test_dictionary_encode_zero_length(): 1357 # User-facing experience of ARROW-7008 1358 arr = pa.array([], type=pa.string()) 1359 encoded = arr.dictionary_encode() 1360 assert len(encoded.dictionary) == 0 1361 encoded.validate(full=True) 1362 1363 1364def test_cast_time32_to_int(): 1365 arr = pa.array(np.array([0, 1, 2], dtype='int32'), 1366 type=pa.time32('s')) 1367 expected = pa.array([0, 1, 2], type='i4') 1368 1369 result = arr.cast('i4') 1370 assert result.equals(expected) 1371 1372 1373def test_cast_time64_to_int(): 1374 arr = pa.array(np.array([0, 1, 2], dtype='int64'), 1375 type=pa.time64('us')) 1376 expected = pa.array([0, 1, 2], type='i8') 1377 1378 result = arr.cast('i8') 1379 assert result.equals(expected) 1380 1381 1382def test_cast_timestamp_to_int(): 1383 arr = pa.array(np.array([0, 1, 2], dtype='int64'), 1384 type=pa.timestamp('us')) 1385 expected = pa.array([0, 1, 2], type='i8') 1386 1387 result = arr.cast('i8') 1388 assert result.equals(expected) 1389 1390 1391def test_cast_date32_to_int(): 1392 arr = pa.array([0, 1, 2], type='i4') 1393 1394 result1 = arr.cast('date32') 1395 result2 = result1.cast('i4') 1396 1397 expected1 = 
pa.array([ 1398 datetime.date(1970, 1, 1), 1399 datetime.date(1970, 1, 2), 1400 datetime.date(1970, 1, 3) 1401 ]).cast('date32') 1402 1403 assert result1.equals(expected1) 1404 assert result2.equals(arr) 1405 1406 1407def test_cast_duration_to_int(): 1408 arr = pa.array(np.array([0, 1, 2], dtype='int64'), 1409 type=pa.duration('us')) 1410 expected = pa.array([0, 1, 2], type='i8') 1411 1412 result = arr.cast('i8') 1413 assert result.equals(expected) 1414 1415 1416def test_cast_binary_to_utf8(): 1417 binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary()) 1418 utf8_arr = binary_arr.cast(pa.utf8()) 1419 expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) 1420 1421 assert utf8_arr.equals(expected) 1422 1423 non_utf8_values = [('mañana').encode('utf-16-le')] 1424 non_utf8_binary = pa.array(non_utf8_values) 1425 assert non_utf8_binary.type == pa.binary() 1426 with pytest.raises(ValueError): 1427 non_utf8_binary.cast(pa.string()) 1428 1429 non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]), 1430 type=pa.binary()) 1431 # No error 1432 casted = non_utf8_all_null.cast(pa.string()) 1433 assert casted.null_count == 1 1434 1435 1436def test_cast_date64_to_int(): 1437 arr = pa.array(np.array([0, 1, 2], dtype='int64'), 1438 type=pa.date64()) 1439 expected = pa.array([0, 1, 2], type='i8') 1440 1441 result = arr.cast('i8') 1442 1443 assert result.equals(expected) 1444 1445 1446def test_date64_from_builtin_datetime(): 1447 val1 = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456) 1448 val2 = datetime.datetime(2000, 1, 1) 1449 result = pa.array([val1, val2], type='date64') 1450 result2 = pa.array([val1.date(), val2.date()], type='date64') 1451 1452 assert result.equals(result2) 1453 1454 as_i8 = result.view('int64') 1455 assert as_i8[0].as_py() == as_i8[1].as_py() 1456 1457 1458@pytest.mark.parametrize(('ty', 'values'), [ 1459 ('bool', [True, False, True]), 1460 ('uint8', range(0, 255)), 1461 ('int8', range(0, 128)), 1462 ('uint16', range(0, 10)), 
1463 ('int16', range(0, 10)), 1464 ('uint32', range(0, 10)), 1465 ('int32', range(0, 10)), 1466 ('uint64', range(0, 10)), 1467 ('int64', range(0, 10)), 1468 ('float', [0.0, 0.1, 0.2]), 1469 ('double', [0.0, 0.1, 0.2]), 1470 ('string', ['a', 'b', 'c']), 1471 ('binary', [b'a', b'b', b'c']), 1472 (pa.binary(3), [b'abc', b'bcd', b'cde']) 1473]) 1474def test_cast_identities(ty, values): 1475 arr = pa.array(values, type=ty) 1476 assert arr.cast(ty).equals(arr) 1477 1478 1479pickle_test_parametrize = pytest.mark.parametrize( 1480 ('data', 'typ'), 1481 [ 1482 ([True, False, True, True], pa.bool_()), 1483 ([1, 2, 4, 6], pa.int64()), 1484 ([1.0, 2.5, None], pa.float64()), 1485 (['a', None, 'b'], pa.string()), 1486 ([], None), 1487 ([[1, 2], [3]], pa.list_(pa.int64())), 1488 ([[4, 5], [6]], pa.large_list(pa.int16())), 1489 ([['a'], None, ['b', 'c']], pa.list_(pa.string())), 1490 ([(1, 'a'), (2, 'c'), None], 1491 pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) 1492 ] 1493) 1494 1495 1496@pickle_test_parametrize 1497def test_array_pickle(data, typ): 1498 # Allocate here so that we don't have any Arrow data allocated. 1499 # This is needed to ensure that allocator tests can be reliable. 
1500 array = pa.array(data, type=typ) 1501 for proto in range(0, pickle.HIGHEST_PROTOCOL + 1): 1502 result = pickle.loads(pickle.dumps(array, proto)) 1503 assert array.equals(result) 1504 1505 1506def test_array_pickle_dictionary(): 1507 # not included in the above as dictionary array cannot be created with 1508 # the pa.array function 1509 array = pa.DictionaryArray.from_arrays([0, 1, 2, 0, 1], ['a', 'b', 'c']) 1510 for proto in range(0, pickle.HIGHEST_PROTOCOL + 1): 1511 result = pickle.loads(pickle.dumps(array, proto)) 1512 assert array.equals(result) 1513 1514 1515@h.given( 1516 past.arrays( 1517 past.all_types, 1518 size=st.integers(min_value=0, max_value=10) 1519 ) 1520) 1521def test_pickling(arr): 1522 data = pickle.dumps(arr) 1523 restored = pickle.loads(data) 1524 assert arr.equals(restored) 1525 1526 1527@pickle_test_parametrize 1528def test_array_pickle5(data, typ): 1529 # Test zero-copy pickling with protocol 5 (PEP 574) 1530 picklemod = pickle5 or pickle 1531 if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5: 1532 pytest.skip("need pickle5 package or Python 3.8+") 1533 1534 array = pa.array(data, type=typ) 1535 addresses = [buf.address if buf is not None else 0 1536 for buf in array.buffers()] 1537 1538 for proto in range(5, pickle.HIGHEST_PROTOCOL + 1): 1539 buffers = [] 1540 pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append) 1541 result = picklemod.loads(pickled, buffers=buffers) 1542 assert array.equals(result) 1543 1544 result_addresses = [buf.address if buf is not None else 0 1545 for buf in result.buffers()] 1546 assert result_addresses == addresses 1547 1548 1549@pytest.mark.parametrize( 1550 'narr', 1551 [ 1552 np.arange(10, dtype=np.int64), 1553 np.arange(10, dtype=np.int32), 1554 np.arange(10, dtype=np.int16), 1555 np.arange(10, dtype=np.int8), 1556 np.arange(10, dtype=np.uint64), 1557 np.arange(10, dtype=np.uint32), 1558 np.arange(10, dtype=np.uint16), 1559 np.arange(10, dtype=np.uint8), 1560 np.arange(10, 
dtype=np.float64), 1561 np.arange(10, dtype=np.float32), 1562 np.arange(10, dtype=np.float16), 1563 ] 1564) 1565def test_to_numpy_roundtrip(narr): 1566 arr = pa.array(narr) 1567 assert narr.dtype == arr.to_numpy().dtype 1568 np.testing.assert_array_equal(narr, arr.to_numpy()) 1569 np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy()) 1570 np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy()) 1571 np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy()) 1572 1573 1574def test_array_uint64_from_py_over_range(): 1575 arr = pa.array([2 ** 63], type=pa.uint64()) 1576 expected = pa.array(np.array([2 ** 63], dtype='u8')) 1577 assert arr.equals(expected) 1578 1579 1580def test_array_conversions_no_sentinel_values(): 1581 arr = np.array([1, 2, 3, 4], dtype='int8') 1582 refcount = sys.getrefcount(arr) 1583 arr2 = pa.array(arr) # noqa 1584 assert sys.getrefcount(arr) == (refcount + 1) 1585 1586 assert arr2.type == 'int8' 1587 1588 arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'), 1589 type='float32') 1590 assert arr3.type == 'float32' 1591 assert arr3.null_count == 0 1592 1593 1594def test_time32_time64_from_integer(): 1595 # ARROW-4111 1596 result = pa.array([1, 2, None], type=pa.time32('s')) 1597 expected = pa.array([datetime.time(second=1), 1598 datetime.time(second=2), None], 1599 type=pa.time32('s')) 1600 assert result.equals(expected) 1601 1602 result = pa.array([1, 2, None], type=pa.time32('ms')) 1603 expected = pa.array([datetime.time(microsecond=1000), 1604 datetime.time(microsecond=2000), None], 1605 type=pa.time32('ms')) 1606 assert result.equals(expected) 1607 1608 result = pa.array([1, 2, None], type=pa.time64('us')) 1609 expected = pa.array([datetime.time(microsecond=1), 1610 datetime.time(microsecond=2), None], 1611 type=pa.time64('us')) 1612 assert result.equals(expected) 1613 1614 result = pa.array([1000, 2000, None], type=pa.time64('ns')) 1615 expected = pa.array([datetime.time(microsecond=1), 1616 
datetime.time(microsecond=2), None], 1617 type=pa.time64('ns')) 1618 assert result.equals(expected) 1619 1620 1621def test_binary_string_pandas_null_sentinels(): 1622 # ARROW-6227 1623 def _check_case(ty): 1624 arr = pa.array(['string', np.nan], type=ty, from_pandas=True) 1625 expected = pa.array(['string', None], type=ty) 1626 assert arr.equals(expected) 1627 _check_case('binary') 1628 _check_case('utf8') 1629 1630 1631def test_pandas_null_sentinels_raise_error(): 1632 # ARROW-6227 1633 cases = [ 1634 ([None, np.nan], 'null'), 1635 (['string', np.nan], 'binary'), 1636 (['string', np.nan], 'utf8'), 1637 (['string', np.nan], 'large_binary'), 1638 (['string', np.nan], 'large_utf8'), 1639 ([b'string', np.nan], pa.binary(6)), 1640 ([True, np.nan], pa.bool_()), 1641 ([decimal.Decimal('0'), np.nan], pa.decimal128(12, 2)), 1642 ([0, np.nan], pa.date32()), 1643 ([0, np.nan], pa.date32()), 1644 ([0, np.nan], pa.date64()), 1645 ([0, np.nan], pa.time32('s')), 1646 ([0, np.nan], pa.time64('us')), 1647 ([0, np.nan], pa.timestamp('us')), 1648 ([0, np.nan], pa.duration('us')), 1649 ] 1650 for case, ty in cases: 1651 # Both types of exceptions are raised. 
May want to clean that up 1652 with pytest.raises((ValueError, TypeError)): 1653 pa.array(case, type=ty) 1654 1655 # from_pandas option suppresses failure 1656 result = pa.array(case, type=ty, from_pandas=True) 1657 assert result.null_count == (1 if ty != 'null' else 2) 1658 1659 1660@pytest.mark.pandas 1661def test_pandas_null_sentinels_index(): 1662 # ARROW-7023 - ensure that when passing a pandas Index, "from_pandas" 1663 # semantics are used 1664 import pandas as pd 1665 idx = pd.Index([1, 2, np.nan], dtype=object) 1666 result = pa.array(idx) 1667 expected = pa.array([1, 2, np.nan], from_pandas=True) 1668 assert result.equals(expected) 1669 1670 1671def test_array_from_numpy_datetimeD(): 1672 arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]') 1673 1674 result = pa.array(arr) 1675 expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32()) 1676 assert result.equals(expected) 1677 1678 1679@pytest.mark.parametrize(('dtype', 'type'), [ 1680 ('datetime64[s]', pa.timestamp('s')), 1681 ('datetime64[ms]', pa.timestamp('ms')), 1682 ('datetime64[us]', pa.timestamp('us')), 1683 ('datetime64[ns]', pa.timestamp('ns')) 1684]) 1685def test_array_from_numpy_datetime(dtype, type): 1686 data = [ 1687 None, 1688 datetime.datetime(2017, 4, 4, 12, 11, 10), 1689 datetime.datetime(2018, 1, 1, 0, 2, 0) 1690 ] 1691 1692 # from numpy array 1693 arr = pa.array(np.array(data, dtype=dtype)) 1694 expected = pa.array(data, type=type) 1695 assert arr.equals(expected) 1696 1697 # from list of numpy scalars 1698 arr = pa.array(list(np.array(data, dtype=dtype))) 1699 assert arr.equals(expected) 1700 1701 1702def test_array_from_different_numpy_datetime_units_raises(): 1703 data = [ 1704 None, 1705 datetime.datetime(2017, 4, 4, 12, 11, 10), 1706 datetime.datetime(2018, 1, 1, 0, 2, 0) 1707 ] 1708 s = np.array(data, dtype='datetime64[s]') 1709 ms = np.array(data, dtype='datetime64[ms]') 1710 data = list(s[:2]) + list(ms[2:]) 1711 1712 with 
pytest.raises(pa.ArrowNotImplementedError): 1713 pa.array(data) 1714 1715 1716@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's']) 1717def test_array_from_list_of_timestamps(unit): 1718 n = np.datetime64('NaT', unit) 1719 x = np.datetime64('2017-01-01 01:01:01.111111111', unit) 1720 y = np.datetime64('2018-11-22 12:24:48.111111111', unit) 1721 1722 a1 = pa.array([n, x, y]) 1723 a2 = pa.array([n, x, y], type=pa.timestamp(unit)) 1724 1725 assert a1.type == a2.type 1726 assert a1.type.unit == unit 1727 assert a1[0] == a2[0] 1728 1729 1730def test_array_from_timestamp_with_generic_unit(): 1731 n = np.datetime64('NaT') 1732 x = np.datetime64('2017-01-01 01:01:01.111111111') 1733 y = np.datetime64('2018-11-22 12:24:48.111111111') 1734 1735 with pytest.raises(pa.ArrowNotImplementedError, 1736 match='Unbound or generic datetime64 time unit'): 1737 pa.array([n, x, y]) 1738 1739 1740@pytest.mark.parametrize(('dtype', 'type'), [ 1741 ('timedelta64[s]', pa.duration('s')), 1742 ('timedelta64[ms]', pa.duration('ms')), 1743 ('timedelta64[us]', pa.duration('us')), 1744 ('timedelta64[ns]', pa.duration('ns')) 1745]) 1746def test_array_from_numpy_timedelta(dtype, type): 1747 data = [ 1748 None, 1749 datetime.timedelta(1), 1750 datetime.timedelta(0, 1) 1751 ] 1752 1753 # from numpy array 1754 np_arr = np.array(data, dtype=dtype) 1755 arr = pa.array(np_arr) 1756 assert isinstance(arr, pa.DurationArray) 1757 assert arr.type == type 1758 expected = pa.array(data, type=type) 1759 assert arr.equals(expected) 1760 assert arr.to_pylist() == data 1761 1762 # from list of numpy scalars 1763 arr = pa.array(list(np.array(data, dtype=dtype))) 1764 assert arr.equals(expected) 1765 assert arr.to_pylist() == data 1766 1767 1768def test_array_from_numpy_timedelta_incorrect_unit(): 1769 # generic (no unit) 1770 td = np.timedelta64(1) 1771 1772 for data in [[td], np.array([td])]: 1773 with pytest.raises(NotImplementedError): 1774 pa.array(data) 1775 1776 # unsupported unit 1777 td = 
np.timedelta64(1, 'M') 1778 for data in [[td], np.array([td])]: 1779 with pytest.raises(NotImplementedError): 1780 pa.array(data) 1781 1782 1783def test_array_from_numpy_ascii(): 1784 arr = np.array(['abcde', 'abc', ''], dtype='|S5') 1785 1786 arrow_arr = pa.array(arr) 1787 assert arrow_arr.type == 'binary' 1788 expected = pa.array(['abcde', 'abc', ''], type='binary') 1789 assert arrow_arr.equals(expected) 1790 1791 mask = np.array([False, True, False]) 1792 arrow_arr = pa.array(arr, mask=mask) 1793 expected = pa.array(['abcde', None, ''], type='binary') 1794 assert arrow_arr.equals(expected) 1795 1796 # Strided variant 1797 arr = np.array(['abcde', 'abc', ''] * 5, dtype='|S5')[::2] 1798 mask = np.array([False, True, False] * 5)[::2] 1799 arrow_arr = pa.array(arr, mask=mask) 1800 1801 expected = pa.array(['abcde', '', None, 'abcde', '', None, 'abcde', ''], 1802 type='binary') 1803 assert arrow_arr.equals(expected) 1804 1805 # 0 itemsize 1806 arr = np.array(['', '', ''], dtype='|S0') 1807 arrow_arr = pa.array(arr) 1808 expected = pa.array(['', '', ''], type='binary') 1809 assert arrow_arr.equals(expected) 1810 1811 1812def test_array_from_numpy_unicode(): 1813 dtypes = ['<U5', '>U5'] 1814 1815 for dtype in dtypes: 1816 arr = np.array(['abcde', 'abc', ''], dtype=dtype) 1817 1818 arrow_arr = pa.array(arr) 1819 assert arrow_arr.type == 'utf8' 1820 expected = pa.array(['abcde', 'abc', ''], type='utf8') 1821 assert arrow_arr.equals(expected) 1822 1823 mask = np.array([False, True, False]) 1824 arrow_arr = pa.array(arr, mask=mask) 1825 expected = pa.array(['abcde', None, ''], type='utf8') 1826 assert arrow_arr.equals(expected) 1827 1828 # Strided variant 1829 arr = np.array(['abcde', 'abc', ''] * 5, dtype=dtype)[::2] 1830 mask = np.array([False, True, False] * 5)[::2] 1831 arrow_arr = pa.array(arr, mask=mask) 1832 1833 expected = pa.array(['abcde', '', None, 'abcde', '', None, 1834 'abcde', ''], type='utf8') 1835 assert arrow_arr.equals(expected) 1836 1837 # 0 itemsize 
1838 arr = np.array(['', '', ''], dtype='<U0') 1839 arrow_arr = pa.array(arr) 1840 expected = pa.array(['', '', ''], type='utf8') 1841 assert arrow_arr.equals(expected) 1842 1843 1844def test_array_string_from_non_string(): 1845 # ARROW-5682 - when converting to string raise on non string-like dtype 1846 with pytest.raises(TypeError): 1847 pa.array(np.array([1, 2, 3]), type=pa.string()) 1848 1849 1850def test_array_string_from_all_null(): 1851 # ARROW-5682 1852 vals = np.array([None, None], dtype=object) 1853 arr = pa.array(vals, type=pa.string()) 1854 assert arr.null_count == 2 1855 1856 vals = np.array([np.nan, np.nan], dtype='float64') 1857 # by default raises, but accept as all-null when from_pandas=True 1858 with pytest.raises(TypeError): 1859 pa.array(vals, type=pa.string()) 1860 arr = pa.array(vals, type=pa.string(), from_pandas=True) 1861 assert arr.null_count == 2 1862 1863 1864def test_array_from_masked(): 1865 ma = np.ma.array([1, 2, 3, 4], dtype='int64', 1866 mask=[False, False, True, False]) 1867 result = pa.array(ma) 1868 expected = pa.array([1, 2, None, 4], type='int64') 1869 assert expected.equals(result) 1870 1871 with pytest.raises(ValueError, match="Cannot pass a numpy masked array"): 1872 pa.array(ma, mask=np.array([True, False, False, False])) 1873 1874 1875def test_array_from_shrunken_masked(): 1876 ma = np.ma.array([0], dtype='int64') 1877 result = pa.array(ma) 1878 expected = pa.array([0], type='int64') 1879 assert expected.equals(result) 1880 1881 1882def test_array_from_invalid_dim_raises(): 1883 msg = "only handle 1-dimensional arrays" 1884 arr2d = np.array([[1, 2, 3], [4, 5, 6]]) 1885 with pytest.raises(ValueError, match=msg): 1886 pa.array(arr2d) 1887 1888 arr0d = np.array(0) 1889 with pytest.raises(ValueError, match=msg): 1890 pa.array(arr0d) 1891 1892 1893def test_array_from_strided_bool(): 1894 # ARROW-6325 1895 arr = np.ones((3, 2), dtype=bool) 1896 result = pa.array(arr[:, 0]) 1897 expected = pa.array([True, True, True]) 1898 
assert result.equals(expected) 1899 result = pa.array(arr[0, :]) 1900 expected = pa.array([True, True]) 1901 assert result.equals(expected) 1902 1903 1904def test_buffers_primitive(): 1905 a = pa.array([1, 2, None, 4], type=pa.int16()) 1906 buffers = a.buffers() 1907 assert len(buffers) == 2 1908 null_bitmap = buffers[0].to_pybytes() 1909 assert 1 <= len(null_bitmap) <= 64 # XXX this is varying 1910 assert bytearray(null_bitmap)[0] == 0b00001011 1911 1912 # Slicing does not affect the buffers but the offset 1913 a_sliced = a[1:] 1914 buffers = a_sliced.buffers() 1915 a_sliced.offset == 1 1916 assert len(buffers) == 2 1917 null_bitmap = buffers[0].to_pybytes() 1918 assert 1 <= len(null_bitmap) <= 64 # XXX this is varying 1919 assert bytearray(null_bitmap)[0] == 0b00001011 1920 1921 assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4) 1922 1923 a = pa.array(np.int8([4, 5, 6])) 1924 buffers = a.buffers() 1925 assert len(buffers) == 2 1926 # No null bitmap from Numpy int array 1927 assert buffers[0] is None 1928 assert struct.unpack('3b', buffers[1].to_pybytes()) == (4, 5, 6) 1929 1930 a = pa.array([b'foo!', None, b'bar!!']) 1931 buffers = a.buffers() 1932 assert len(buffers) == 3 1933 null_bitmap = buffers[0].to_pybytes() 1934 assert bytearray(null_bitmap)[0] == 0b00000101 1935 offsets = buffers[1].to_pybytes() 1936 assert struct.unpack('4i', offsets) == (0, 4, 4, 9) 1937 values = buffers[2].to_pybytes() 1938 assert values == b'foo!bar!!' 


def test_buffers_nested():
    # Check the raw buffers of nested arrays: parent buffers come first,
    # followed by the buffers of each child
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    # '8x' skips the 8 bytes stored at the null child slot
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)


def test_nbytes_sizeof():
    # nbytes must match the expected total buffer size, and
    # sys.getsizeof() must account for at least that many bytes
    a = pa.array(np.array([4, 5, 6], dtype='int64'))
    assert a.nbytes == 8 * 3
    assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes
    a = pa.array([1, None, 3], type='int64')
    assert a.nbytes == 8*3 + 1
    assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    assert a.nbytes == 1 + 4 * 4 + 1 + 6 * 8
    assert sys.getsizeof(a) >= object.__sizeof__(a) + a.nbytes


def test_invalid_tensor_constructor_repr():
    # ARROW-2638: prevent calling extension class constructors directly
    with pytest.raises(TypeError):
        repr(pa.Tensor([1]))


def test_invalid_tensor_construction():
    # Calling the Tensor constructor without arguments raises TypeError
    with pytest.raises(TypeError):
        pa.Tensor()


@pytest.mark.parametrize(('offset_type', 'list_type_factory'),
                         [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
def test_list_array_flatten(offset_type, list_type_factory):
    # flatten(), .offsets and .values on a doubly nested list array, for
    # both list (int32 offsets) and large_list (int64 offsets)
    typ2 = list_type_factory(
        list_type_factory(
            pa.int64()
        )
    )
    arr2 = pa.array([
        None,
        [
            [1, None, 2],
            None,
            [3, 4]
        ],
        [],
        [
            [],
            [5, 6],
            None
        ],
        [
            [7, 8]
        ]
    ], type=typ2)
    offsets2 = pa.array([0, 0, 3, 3, 6, 7], type=offset_type)

    typ1 = list_type_factory(pa.int64())
    arr1 = pa.array([
        [1, None, 2],
        None,
        [3, 4],
        [],
        [5, 6],
        None,
        [7, 8]
    ], type=typ1)
    offsets1 = pa.array([0, 3, 3, 5, 5, 7, 7, 9], type=offset_type)

    arr0 = pa.array([
        1, None, 2,
        3, 4,
        5, 6,
        7, 8
    ], type=pa.int64())

    assert arr2.flatten().equals(arr1)
    assert arr2.offsets.equals(offsets2)
    assert arr2.values.equals(arr1)
    assert arr1.flatten().equals(arr0)
    assert arr1.offsets.equals(offsets1)
    assert arr1.values.equals(arr0)
    assert arr2.flatten().flatten().equals(arr0)
    assert arr2.values.values.equals(arr0)


@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
def test_list_array_flatten_non_canonical(list_type_factory):
    # Non-canonical list array (null elements backed by non-empty sublists)
    typ = list_type_factory(pa.int64())
    arr = pa.array([[1], [2, 3], [4, 5, 6]], type=typ)
    buffers = arr.buffers()[:2]
    buffers[0] = pa.py_buffer(b"\x05")  # validity bitmap
    arr = arr.from_buffers(arr.type, len(arr), buffers, children=[arr.values])
    assert arr.to_pylist() == [[1], None, [4, 5, 6]]
    assert arr.offsets.to_pylist() == [0, 1, 3, 6]

    # flatten() leaves out the values behind the null element
    flattened = arr.flatten()
    flattened.validate(full=True)
    assert flattened.type == typ.value_type
    assert flattened.to_pylist() == [1, 4, 5, 6]

    # .values is the physical values array (including masked elements)
    assert arr.values.to_pylist() == [1, 2, 3, 4, 5, 6]


@pytest.mark.parametrize('klass', [pa.ListArray, pa.LargeListArray])
def test_list_array_values_offsets_sliced(klass):
    # ARROW-7301
    arr = klass.from_arrays(offsets=[0, 3, 4, 6], values=[1, 2, 3, 4, 5, 6])
    assert arr.values.to_pylist() == [1, 2, 3, 4, 5, 6]
    assert arr.offsets.to_pylist() == [0, 3, 4, 6]

    # sliced -> values keeps referring to full values buffer, but offsets is
    # sliced as well so the offsets correctly point into the full values array
    # sliced -> flatten() will return the sliced value array.
    arr2 = arr[1:]
    assert arr2.values.to_pylist() == [1, 2, 3, 4, 5, 6]
    assert arr2.offsets.to_pylist() == [3, 4, 6]
    assert arr2.flatten().to_pylist() == [4, 5, 6]
    i = arr2.offsets[0].as_py()
    j = arr2.offsets[1].as_py()
    assert arr2[0].as_py() == arr2.values[i:j].to_pylist() == [4]


def test_fixed_size_list_array_flatten():
    # flatten() on nested fixed-size lists: a null list flattens to as
    # many nulls as the list size
    typ2 = pa.list_(pa.list_(pa.int64(), 2), 3)
    arr2 = pa.array([
        [
            [1, 2],
            [3, 4],
            [5, 6],
        ],
        None,
        [
            [7, None],
            None,
            [8, 9]
        ],
    ], type=typ2)
    assert arr2.type.equals(typ2)

    typ1 = pa.list_(pa.int64(), 2)
    arr1 = pa.array([
        [1, 2], [3, 4], [5, 6],
        None, None, None,
        [7, None], None, [8, 9]
    ], type=typ1)
    assert arr1.type.equals(typ1)
    assert arr2.flatten().equals(arr1)

    typ0 = pa.int64()
    arr0 = pa.array([
        1, 2, 3, 4, 5, 6,
        None, None, None, None, None, None,
        7, None, None, None, 8, 9,
    ], type=typ0)
    assert arr0.type.equals(typ0)
    assert arr1.flatten().equals(arr0)
    assert arr2.flatten().flatten().equals(arr0)


def test_struct_array_flatten():
    # flatten() on struct arrays returns one array per field; a null
    # struct yields a null in every field, also after slicing
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.type == pa.int16()
    assert ys.type == pa.float32()
    assert xs.to_pylist() == [1, 3, 5]
    assert ys.to_pylist() == [2.5, 4.5, 6.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [3, 5]
    assert ys.to_pylist() == [4.5, 6.5]

    a = pa.array([(1, 2.5), None, (3, 4.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.to_pylist() == [1, None, 3]
    assert ys.to_pylist() == [2.5, None, 4.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [None, 3]
    assert ys.to_pylist() == [None, 4.5]

    a = pa.array([(1, None), (2, 3.5), (None, 4.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.to_pylist() == [1, 2, None]
    assert ys.to_pylist() == [None, 3.5, 4.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [2, None]
    assert ys.to_pylist() == [3.5, 4.5]

    a = pa.array([(1, None), None, (None, 2.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.to_pylist() == [1, None, None]
    assert ys.to_pylist() == [None, None, 2.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [None, None]
    assert ys.to_pylist() == [None, 2.5]


def test_struct_array_field():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x0 = a.field(0)
    y0 = a.field(1)
    x1 = a.field(-2)
    y1 = a.field(-1)
    x2 = a.field('x')
    y2 = a.field('y')

    assert isinstance(x0, pa.lib.Int16Array)
    assert isinstance(y1, pa.lib.FloatArray)
    assert x0.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))
    assert x0.equals(x1)
    assert x0.equals(x2)
2187 assert y0.equals(y1) 2188 assert y0.equals(y2) 2189 2190 for invalid_index in [None, pa.int16()]: 2191 with pytest.raises(TypeError): 2192 a.field(invalid_index) 2193 2194 for invalid_index in [3, -3]: 2195 with pytest.raises(IndexError): 2196 a.field(invalid_index) 2197 2198 for invalid_name in ['z', '']: 2199 with pytest.raises(KeyError): 2200 a.field(invalid_name) 2201 2202 2203def test_empty_cast(): 2204 types = [ 2205 pa.null(), 2206 pa.bool_(), 2207 pa.int8(), 2208 pa.int16(), 2209 pa.int32(), 2210 pa.int64(), 2211 pa.uint8(), 2212 pa.uint16(), 2213 pa.uint32(), 2214 pa.uint64(), 2215 pa.float16(), 2216 pa.float32(), 2217 pa.float64(), 2218 pa.date32(), 2219 pa.date64(), 2220 pa.binary(), 2221 pa.binary(length=4), 2222 pa.string(), 2223 ] 2224 2225 for (t1, t2) in itertools.product(types, types): 2226 try: 2227 # ARROW-4766: Ensure that supported types conversion don't segfault 2228 # on empty arrays of common types 2229 pa.array([], type=t1).cast(t2) 2230 except (pa.lib.ArrowNotImplementedError, pa.ArrowInvalid): 2231 continue 2232 2233 2234def test_nested_dictionary_array(): 2235 dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b']) 2236 list_arr = pa.ListArray.from_arrays([0, 2, 3], dict_arr) 2237 assert list_arr.to_pylist() == [['a', 'b'], ['a']] 2238 2239 dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b']) 2240 dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr) 2241 assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a'] 2242 2243 2244def test_array_from_numpy_str_utf8(): 2245 # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python 2246 # 2 they are NPY_STRING (binary), so we must do UTF-8 validation 2247 vec = np.array(["toto", "tata"]) 2248 vec2 = np.array(["toto", "tata"], dtype=object) 2249 2250 arr = pa.array(vec, pa.string()) 2251 arr2 = pa.array(vec2, pa.string()) 2252 expected = pa.array(["toto", "tata"]) 2253 assert arr.equals(expected) 2254 assert arr2.equals(expected) 2255 
2256 # with mask, separate code path 2257 mask = np.array([False, False], dtype=bool) 2258 arr = pa.array(vec, pa.string(), mask=mask) 2259 assert arr.equals(expected) 2260 2261 # UTF8 validation failures 2262 vec = np.array([('mañana').encode('utf-16-le')]) 2263 with pytest.raises(ValueError): 2264 pa.array(vec, pa.string()) 2265 2266 with pytest.raises(ValueError): 2267 pa.array(vec, pa.string(), mask=np.array([False])) 2268 2269 2270@pytest.mark.large_memory 2271def test_numpy_binary_overflow_to_chunked(): 2272 # ARROW-3762, ARROW-5966 2273 2274 # 2^31 + 1 bytes 2275 values = [b'x'] 2276 unicode_values = ['x'] 2277 2278 # Make 10 unique 1MB strings then repeat then 2048 times 2279 unique_strings = { 2280 i: b'x' * ((1 << 20) - 1) + str(i % 10).encode('utf8') 2281 for i in range(10) 2282 } 2283 unicode_unique_strings = {i: x.decode('utf8') 2284 for i, x in unique_strings.items()} 2285 values += [unique_strings[i % 10] for i in range(1 << 11)] 2286 unicode_values += [unicode_unique_strings[i % 10] 2287 for i in range(1 << 11)] 2288 2289 for case, ex_type in [(values, pa.binary()), 2290 (unicode_values, pa.utf8())]: 2291 arr = np.array(case) 2292 arrow_arr = pa.array(arr) 2293 arr = None 2294 2295 assert isinstance(arrow_arr, pa.ChunkedArray) 2296 assert arrow_arr.type == ex_type 2297 2298 # Split up into 16MB chunks. 
128 * 16 = 2048, so 129 2299 assert arrow_arr.num_chunks == 129 2300 2301 value_index = 0 2302 for i in range(arrow_arr.num_chunks): 2303 chunk = arrow_arr.chunk(i) 2304 for val in chunk: 2305 assert val.as_py() == case[value_index] 2306 value_index += 1 2307 2308 2309@pytest.mark.large_memory 2310def test_list_child_overflow_to_chunked(): 2311 vals = [['x' * 1024]] * ((2 << 20) + 1) 2312 with pytest.raises(ValueError, match="overflowed"): 2313 pa.array(vals) 2314 2315 2316def test_infer_type_masked(): 2317 # ARROW-5208 2318 ty = pa.infer_type(['foo', 'bar', None, 2], 2319 mask=[False, False, False, True]) 2320 assert ty == pa.utf8() 2321 2322 # all masked 2323 ty = pa.infer_type(['foo', 'bar', None, 2], 2324 mask=np.array([True, True, True, True])) 2325 assert ty == pa.null() 2326 2327 # length 0 2328 assert pa.infer_type([], mask=[]) == pa.null() 2329 2330 2331def test_array_masked(): 2332 # ARROW-5208 2333 arr = pa.array([4, None, 4, 3.], 2334 mask=np.array([False, True, False, True])) 2335 assert arr.type == pa.int64() 2336 2337 # ndarray dtype=object argument 2338 arr = pa.array(np.array([4, None, 4, 3.], dtype="O"), 2339 mask=np.array([False, True, False, True])) 2340 assert arr.type == pa.int64() 2341 2342 2343def test_array_from_large_pyints(): 2344 # ARROW-5430 2345 with pytest.raises(OverflowError): 2346 # too large for int64 so dtype must be explicitly provided 2347 pa.array([int(2 ** 63)]) 2348 2349 2350def test_array_protocol(): 2351 2352 class MyArray: 2353 def __init__(self, data): 2354 self.data = data 2355 2356 def __arrow_array__(self, type=None): 2357 return pa.array(self.data, type=type) 2358 2359 arr = MyArray(np.array([1, 2, 3], dtype='int64')) 2360 result = pa.array(arr) 2361 expected = pa.array([1, 2, 3], type=pa.int64()) 2362 assert result.equals(expected) 2363 result = pa.array(arr, type=pa.int64()) 2364 expected = pa.array([1, 2, 3], type=pa.int64()) 2365 assert result.equals(expected) 2366 result = pa.array(arr, type=pa.float64()) 2367 
expected = pa.array([1, 2, 3], type=pa.float64()) 2368 assert result.equals(expected) 2369 2370 # raise error when passing size or mask keywords 2371 with pytest.raises(ValueError): 2372 pa.array(arr, mask=np.array([True, False, True])) 2373 with pytest.raises(ValueError): 2374 pa.array(arr, size=3) 2375 2376 # ensure the return value is an Array 2377 class MyArrayInvalid: 2378 def __init__(self, data): 2379 self.data = data 2380 2381 def __arrow_array__(self, type=None): 2382 return np.array(self.data) 2383 2384 arr = MyArrayInvalid(np.array([1, 2, 3], dtype='int64')) 2385 with pytest.raises(TypeError): 2386 pa.array(arr) 2387 2388 # ARROW-7066 - allow ChunkedArray output 2389 class MyArray2: 2390 def __init__(self, data): 2391 self.data = data 2392 2393 def __arrow_array__(self, type=None): 2394 return pa.chunked_array([self.data], type=type) 2395 2396 arr = MyArray2(np.array([1, 2, 3], dtype='int64')) 2397 result = pa.array(arr) 2398 expected = pa.chunked_array([[1, 2, 3]], type=pa.int64()) 2399 assert result.equals(expected) 2400 2401 2402def test_concat_array(): 2403 concatenated = pa.concat_arrays( 2404 [pa.array([1, 2]), pa.array([3, 4])]) 2405 assert concatenated.equals(pa.array([1, 2, 3, 4])) 2406 2407 2408def test_concat_array_different_types(): 2409 with pytest.raises(pa.ArrowInvalid): 2410 pa.concat_arrays([pa.array([1]), pa.array([2.])]) 2411 2412 2413@pytest.mark.pandas 2414def test_to_pandas_timezone(): 2415 # https://issues.apache.org/jira/browse/ARROW-6652 2416 arr = pa.array([1, 2, 3], type=pa.timestamp('s', tz='Europe/Brussels')) 2417 s = arr.to_pandas() 2418 assert s.dt.tz is not None 2419 arr = pa.chunked_array([arr]) 2420 s = arr.to_pandas() 2421 assert s.dt.tz is not None 2422