1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18import datetime
19import decimal
20import hypothesis as h
21import hypothesis.strategies as st
22import itertools
23import pickle
24import pytest
25import struct
26import sys
27
28import numpy as np
29try:
30    import pickle5
31except ImportError:
32    pickle5 = None
33
34import pyarrow as pa
35import pyarrow.tests.strategies as past
36from pyarrow import compat
37
38
def test_total_bytes_allocated():
    """``pa.total_allocated_bytes()`` must report zero at this point."""
    allocated = pa.total_allocated_bytes()
    assert allocated == 0
41
42
def test_getitem_NULL():
    """Indexing a null slot must return the ``pa.NULL`` object itself."""
    values = pa.array([1, None, 2])
    null_slot = values[1]
    assert null_slot is pa.NULL
46
47
def test_constructor_raises():
    """Calling the ``pa.Array`` class directly must raise ``TypeError``."""
    # ARROW-2638: extension class constructors must not be called directly.
    # Users can run into this by writing ``pa.Array`` instead of
    # ``pa.array`` (wrong capitalization).
    direct_args = [1, 2]
    with pytest.raises(TypeError):
        pa.Array(direct_args)
53
54
def test_list_format():
    """Check the string rendering of a list array with null entries."""
    nested = pa.array([[1], None, [2, 3, None]])
    expected_lines = [
        "[",
        "  [",
        "    1",
        "  ],",
        "  null,",
        "  [",
        "    2,",
        "    3,",
        "    null",
        "  ]",
        "]",
    ]
    assert nested.to_string() == "\n".join(expected_lines)
71
72
def test_string_format():
    """Check the string rendering of a string array (empty, null, text)."""
    strings = pa.array(['', None, 'foo'])
    expected_lines = [
        '[',
        '  "",',
        '  null,',
        '  "foo"',
        ']',
    ]
    assert strings.to_string() == "\n".join(expected_lines)
83
84
def test_long_array_format():
    """With ``window=2`` only the first and last two values are printed."""
    numbers = pa.array(range(100))
    expected_lines = [
        "[",
        "  0,",
        "  1,",
        "  ...",
        "  98,",
        "  99",
        "]",
    ]
    assert numbers.to_string(window=2) == "\n".join(expected_lines)
97
98
def test_binary_format():
    """Binary values are rendered as uppercase hex; empty values as ''."""
    blobs = pa.array([b'\x00', b'', None, b'\x01foo', b'\x80\xff'])
    expected_lines = [
        "[",
        "  00,",
        "  ,",
        "  null,",
        "  01666F6F,",
        "  80FF",
        "]",
    ]
    assert blobs.to_string() == "\n".join(expected_lines)
111
112
def test_to_numpy_zero_copy():
    """``to_numpy`` must share memory with the Arrow data buffer."""
    import gc

    arr = pa.array(range(10))
    converted = arr.to_numpy()

    # Zero copy: the ndarray data pointer is the address of the Arrow
    # buffer at index 1 of ``arr.buffers()``.
    data_buffer = arr.buffers()[1]
    assert data_buffer.address == converted.ctypes.data

    # Release the Arrow array and force a collection pass before reading
    # the ndarray again.
    arr = None
    gc.collect()

    # The ndarray must still have a base object and readable contents.
    assert converted.base is not None
    np.testing.assert_array_equal(converted, np.arange(10))
130
131
def test_to_numpy_unsupported_types():
    """Arrays that cannot convert zero-copy raise unless a copy is allowed."""
    # ARROW-2871: Some primitive types are not yet supported in to_numpy
    copy_only_cases = [
        (pa.array([True, False, True]),
         np.array([True, False, True])),
        (pa.array([None, None, None]),
         np.array([None, None, None], dtype=object)),
    ]

    for arrow_arr, expected in copy_only_cases:
        # The default (zero-copy) conversion is rejected ...
        with pytest.raises(ValueError):
            arrow_arr.to_numpy()

        # ... but the conversion works once copying is permitted.
        converted = arrow_arr.to_numpy(zero_copy_only=False)
        np.testing.assert_array_equal(converted, expected)

    # An integer array containing a null is rejected as well, and the
    # error message reports the null count.
    with_null = pa.array([1, 2, None])
    with pytest.raises(ValueError, match="with 1 nulls"):
        with_null.to_numpy()
156
157
def test_to_numpy_writable():
    """Zero-copy results are read-only; writable results are copies."""
    arr = pa.array(range(10))

    # The default conversion is zero-copy, hence not writable.
    read_only = arr.to_numpy()
    with pytest.raises(ValueError):
        read_only[0] = 10

    # A writable result can be mutated without touching the Arrow array.
    writable_copy = arr.to_numpy(zero_copy_only=False, writable=True)
    writable_copy[0] = 10
    first = arr[0].as_py()
    assert first == 0

    # Asking for a writable and zero-copy result at once is an error.
    with pytest.raises(ValueError):
        arr.to_numpy(zero_copy_only=True, writable=True)
173
174
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_to_numpy_datetime64(unit):
    """Timestamp arrays convert to ``datetime64`` with the same unit."""
    raw = [1, 2, 3]
    numpy_dtype = "datetime64[" + unit + "]"
    converted = pa.array(raw, pa.timestamp(unit)).to_numpy()
    np.testing.assert_array_equal(converted,
                                  np.array(raw, dtype=numpy_dtype))
181
182
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
def test_to_numpy_timedelta64(unit):
    """Duration arrays convert to ``timedelta64`` with the same unit."""
    raw = [1, 2, 3]
    numpy_dtype = "timedelta64[" + unit + "]"
    converted = pa.array(raw, pa.duration(unit)).to_numpy()
    np.testing.assert_array_equal(converted,
                                  np.array(raw, dtype=numpy_dtype))
189
190
def test_to_numpy_dictionary():
    """A dictionary-encoded array converts to its decoded object values."""
    # ARROW-7591
    words = ["a", "b", "a"]
    encoded = pa.array(words).dictionary_encode()
    converted = encoded.to_numpy(zero_copy_only=False)
    np.testing.assert_array_equal(converted, np.array(words, dtype=object))
197
198
@pytest.mark.pandas
def test_to_pandas_zero_copy():
    """Check reference counts around repeated ``Array.to_pandas`` calls.

    The first loop verifies that converting does not leave extra references
    to the resulting series or to the source array.  The second loop
    verifies that the series stays usable after the source array has been
    dropped and garbage collection has run.
    """
    import gc

    arr = pa.array(range(10))

    for i in range(10):
        series = arr.to_pandas()
        # sys.getrefcount counts its own argument as one reference, so 2
        # means ``series`` is the only other reference.
        assert sys.getrefcount(series) == 2
        series = None  # noqa

    # After ten conversions nothing else may still reference ``arr``.
    assert sys.getrefcount(arr) == 2

    for i in range(10):
        arr = pa.array(range(10))
        series = arr.to_pandas()
        # Drop the source array before inspecting the series.
        arr = None
        gc.collect()

        # Ensure base is still valid

        # Because of py.test's assert inspection magic, if you put getrefcount
        # on the line being examined, it will be 1 higher than you expect
        base_refcount = sys.getrefcount(series.values.base)
        assert base_refcount == 2
        # Read the data once more to make sure it is still accessible.
        series.sum()
225
226
@pytest.mark.nopandas
@pytest.mark.pandas
def test_asarray():
    """Check ``np.asarray`` on Arrow arrays for several input kinds."""
    # Both markers are set on purpose so that this runs whether or not
    # pandas is present (ARROW-6564).
    plain = [0, 1, 2, 3]
    arr = pa.array(range(4))

    # Iterating yields Int64Value scalars, so numpy builds an object array
    boxed = np.asarray(list(arr))
    assert boxed.tolist() == plain
    assert boxed.dtype == np.dtype('O')
    assert type(boxed[0]) == pa.lib.Int64Value

    # Passing the arrow array itself produces an 'int64' array
    direct = np.asarray(arr)
    assert direct.tolist() == plain
    assert direct.dtype == np.dtype('int64')

    # A dtype may be requested explicitly
    as_text = np.asarray(arr, dtype='str')
    assert as_text.tolist() == ['0', '1', '2', '3']

    # With nulls present the numpy dtype is changed as needed to be able
    # to represent them (here: float64 with NaN).
    nullable = pa.array([0, 1, 2, None])
    assert nullable.type == pa.int64()
    promoted = np.asarray(nullable)
    elements = promoted.tolist()
    assert elements[:3] == [0., 1., 2.]
    assert np.isnan(elements[3])
    assert promoted.dtype == np.dtype('float64')

    # DictionaryType data is converted to a dense numpy array
    dict_indices = pa.array([0, 1, 2, 0, 1])
    dict_values = pa.array(['a', 'b', 'c'])
    dict_arr = pa.DictionaryArray.from_arrays(dict_indices, dict_values)
    dense = np.asarray(dict_arr)
    assert dense.dtype == np.dtype('object')
    assert dense.tolist() == ['a', 'b', 'c', 'a', 'b']
265
266
def test_array_getitem():
    """Integer indexing matches list indexing, including negative indices."""
    arr = pa.array(range(10, 15))
    py_values = arr.to_pylist()
    n = len(arr)

    # Every index valid for the Python list is valid for the array
    for idx in range(-n, n):
        assert arr[idx].as_py() == py_values[idx]

    # Indices below -n or at/above n must raise
    out_of_bounds = itertools.chain(range(-2 * n, -n), range(n, 2 * n))
    for idx in out_of_bounds:
        with pytest.raises(IndexError):
            arr[idx]
279
280
def test_array_slice():
    """Check ``Array.slice`` and slice notation against list slicing."""
    arr = pa.array(range(10))
    n = len(arr)

    # Offset only
    assert arr.slice(2).equals(pa.array(range(2, 10)))

    # Offset and length
    assert arr.slice(2, 4).equals(pa.array(range(2, 6)))

    # 0 offset
    assert arr.slice(0).equals(arr)

    # Slice past end of array
    assert len(arr.slice(n)) == 0

    # Negative offsets are rejected by the method form
    with pytest.raises(IndexError):
        arr.slice(-1)

    # Test slice notation
    assert arr[2:].equals(arr.slice(2))
    assert arr[2:5].equals(arr.slice(2, 3))
    assert arr[-5:].equals(arr.slice(n - 5))

    # Exhaustively compare every start/stop pair with Python list slicing
    py_values = arr.to_pylist()
    bounds = range(-n * 2, n * 2)
    for start, stop in itertools.product(bounds, repeat=2):
        assert arr[start:stop].to_pylist() == py_values[start:stop]
310
311
def test_array_slice_negative_step():
    """Stepped slices on arrays, batches and chunked arrays match numpy."""
    # ARROW-2714
    np_arr = np.arange(20)
    arr = pa.array(np_arr)
    batch = pa.record_batch([arr], names=['f0'])
    chunked_arr = pa.chunked_array([arr])

    # (start, stop, step) triples
    slice_bounds = [
        (None, None, -1),
        (None, 6, -2),
        (10, 6, -2),
        (8, None, -2),
        (2, 10, -2),
        (10, 2, -2),
        (None, None, 2),
        (0, 10, 2),
    ]

    for bounds in slice_bounds:
        case = slice(*bounds)
        np_expected = np_arr[case]

        expected_arr = pa.array(np_expected)
        assert arr[case].equals(expected_arr)

        expected_batch = pa.record_batch([expected_arr], names=['f0'])
        assert batch[case].equals(expected_batch)

        expected_chunked = pa.chunked_array([np_expected])
        assert chunked_arr[case].equals(expected_chunked)
341
342
def test_array_diff():
    """Check the textual output of ``Array.diff``.

    Covers an identical array (empty diff), an array of the same type with
    extra elements (unified-diff style output) and arrays of a different
    type (type mismatch message).
    """
    # ARROW-6252
    arr1 = pa.array(['foo'], type=pa.utf8())
    arr2 = pa.array(['foo', 'bar', None], type=pa.utf8())
    arr3 = pa.array([1, 2, 3])
    arr4 = pa.array([[], [1], None], type=pa.list_(pa.int64()))

    assert arr1.diff(arr1) == ''
    assert arr1.diff(arr2) == '''
@@ -1, +1 @@
+"bar"
+null
'''

    # Both string arrays must report the same type mismatch against the
    # int64 array.  The second check previously repeated the first one
    # verbatim and therefore added no coverage.
    type_mismatch = '# Array types differed: string vs int64'
    assert arr1.diff(arr3).strip() == type_mismatch
    assert arr2.diff(arr3).strip() == type_mismatch
    assert arr1.diff(arr4).strip() == ('# Array types differed: string vs '
                                       'list<item: int64>')
360
361
def test_array_iter():
    """Iterating an array yields every element, in order."""
    arr = pa.array(range(10))

    # Materialize the iteration first and check its length: zipping against
    # range(10) alone would silently pass if fewer elements were yielded.
    values = list(arr)
    assert len(values) == 10

    for i, j in enumerate(values):
        assert i == j

    assert isinstance(arr, compat.Iterable)
369
370
def test_struct_array_slice():
    """Slicing a struct array must slice its child fields consistently."""
    # ARROW-2311: slicing nested arrays needs special care
    fields = [pa.field('a', pa.int8()), pa.field('b', pa.float32())]
    rows = [(1, 2.5), (3, 4.5), (5, 6.5)]
    arr = pa.array(rows, type=pa.struct(fields))

    tail = arr[1:].to_pylist()
    assert tail == [{'a': 3, 'b': 4.5}, {'a': 5, 'b': 6.5}]
378
379
def test_array_factory_invalid_type():
    """``pa.array`` rejects an ndarray holding an arbitrary Python object."""

    class MyObject:
        pass

    unsupported = np.array([MyObject()])
    with pytest.raises(ValueError):
        pa.array(unsupported)
388
389
def test_array_ref_to_ndarray_base():
    """Check that ``pa.array`` adds one reference to its source ndarray."""
    arr = np.array([1, 2, 3])

    refcount = sys.getrefcount(arr)
    # The result has to stay bound to a name: the assertion below expects
    # exactly one additional reference to ``arr`` while ``arr2`` exists.
    arr2 = pa.array(arr)  # noqa
    assert sys.getrefcount(arr) == (refcount + 1)
396
397
def test_array_from_buffers():
    """Build int16 arrays from raw buffers, with and without a bitmap."""
    values_buf = pa.py_buffer(np.int16([4, 5, 6, 7]))
    nulls_buf = pa.py_buffer(np.uint8([0b00001101]))
    int16 = pa.int16()

    # (length, buffers, extra keyword arguments, expected values)
    cases = [
        (4, [nulls_buf, values_buf], {}, [4, None, 6, 7]),
        (4, [None, values_buf], {}, [4, 5, 6, 7]),
        (3, [nulls_buf, values_buf], {'offset': 1}, [None, 6, 7]),
    ]
    for length, buffers, extra, expected in cases:
        built = pa.Array.from_buffers(int16, length, buffers, **extra)
        assert built.type == int16
        assert built.to_pylist() == expected

    # Objects that are not buffers are rejected
    with pytest.raises(TypeError):
        pa.Array.from_buffers(int16, 3, ['', ''], offset=1)
416
417
def test_string_binary_from_buffers():
    """Rebuild string/binary arrays from the buffers of a string array."""
    array = pa.array(["a", None, "b", "c"])
    py_values = ["a", None, "b", "c"]

    # buffers[0] is passed as the null bitmap argument below; buffers[1]
    # and buffers[2] are the two remaining positional buffer arguments.
    bufs = array.buffers()

    # All optional arguments given explicitly
    rebuilt = pa.StringArray.from_buffers(
        len(array), bufs[1], bufs[2], bufs[0], array.null_count,
        array.offset)
    assert rebuilt.to_pylist() == py_values

    # The same buffers interpreted as binary through the generic factory
    as_binary = pa.Array.from_buffers(
        pa.binary(), len(array), array.buffers(), array.null_count,
        array.offset)
    assert as_binary.to_pylist() == [b"a", None, b"b", b"c"]

    # null_count and offset left at their defaults
    rebuilt = pa.StringArray.from_buffers(
        len(array), bufs[1], bufs[2], bufs[0])
    assert rebuilt.to_pylist() == py_values

    # Sliced array: pass -1 as null count together with the slice offset
    tail = array[1:]
    tail_bufs = tail.buffers()
    rebuilt = pa.StringArray.from_buffers(
        len(tail), tail_bufs[1], tail_bufs[2], tail_bufs[0], -1,
        tail.offset)
    assert rebuilt.to_pylist() == [None, "b", "c"]
    assert rebuilt.null_count == 1

    # Slice but exclude all null entries so that we don't need to pass
    # the null bitmap.
    no_nulls = array[2:]
    no_nulls_bufs = no_nulls.buffers()
    rebuilt = pa.StringArray.from_buffers(
        len(no_nulls), no_nulls_bufs[1], no_nulls_bufs[2], None, -1,
        no_nulls.offset)
    assert rebuilt.to_pylist() == ["b", "c"]
    assert rebuilt.null_count == 0
451
452
@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
def test_list_from_buffers(list_type_factory):
    """Rebuild a (large) list array from its buffers plus a child array."""
    ty = list_type_factory(pa.int16())
    array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
    assert array.type == ty

    buffers = array.buffers()
    parent_buffers = [None, buffers[1]]

    # No children
    with pytest.raises(ValueError):
        pa.Array.from_buffers(ty, 4, parent_buffers)

    # The first two buffers go to the list array itself, the remaining
    # ones are used to build the int16 child.
    values = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
    rebuilt = pa.Array.from_buffers(ty, 4, buffers[:2], children=[values])
    assert rebuilt.equals(array)

    # Too many children
    with pytest.raises(ValueError):
        pa.Array.from_buffers(ty, 4, parent_buffers,
                              children=[values, values])
473
474
def test_struct_from_buffers():
    """Rebuild a struct array from its buffers plus child arrays."""
    ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())])
    rows = [{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}]
    array = pa.array(rows, type=ty)
    buffers = array.buffers()

    # No children
    with pytest.raises(ValueError):
        pa.Array.from_buffers(ty, 3, [None, buffers[1]])

    # buffers[1:3] are used for child 'a', buffers[3:] for child 'b', and
    # only buffers[0] is handed to the struct array itself.
    child_a = pa.Array.from_buffers(pa.int16(), 3, buffers[1:3])
    child_b = pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])
    rebuilt = pa.Array.from_buffers(ty, 3, buffers[:1],
                                    children=[child_a, child_b])
    assert rebuilt.equals(array)

    # Not enough children
    with pytest.raises(ValueError):
        pa.Array.from_buffers(ty, 3, [buffers[0]], children=[child_a])
494
495
def test_struct_from_arrays():
    """Build struct arrays from child arrays using names or fields."""
    a = pa.array([4, 5, 6], type=pa.int64())
    b = pa.array(["bar", None, ""])
    c = pa.array([[1, 2], None, [3, None]])
    children = [a, b, c]
    expected_rows = [
        {'a': 4, 'b': 'bar', 'c': [1, 2]},
        {'a': 5, 'b': None, 'c': None},
        {'a': 6, 'b': '', 'c': [3, None]},
    ]
    empty_struct = pa.struct([])

    # --- From field names ---
    by_name = pa.StructArray.from_arrays(children, ["a", "b", "c"])
    expected_type = pa.struct([("a", a.type), ("b", b.type), ("c", c.type)])
    assert by_name.type == expected_type
    assert by_name.to_pylist() == expected_rows

    # Fewer names than arrays
    with pytest.raises(ValueError):
        pa.StructArray.from_arrays(children, ["a", "b"])

    # No children at all
    empty = pa.StructArray.from_arrays([], [])
    assert empty.type == empty_struct
    assert empty.to_pylist() == []

    # --- From fields ---
    fa = pa.field("a", a.type, nullable=False)
    fb = pa.field("b", b.type)
    fc = pa.field("c", c.type)
    by_field = pa.StructArray.from_arrays(children, fields=[fa, fb, fc])
    assert by_field.type == pa.struct([fa, fb, fc])
    # The non-nullable flag of the first field is carried over
    assert not by_field.type[0].nullable
    assert by_field.to_pylist() == expected_rows

    # Fewer fields than arrays
    with pytest.raises(ValueError):
        pa.StructArray.from_arrays(children, fields=[fa, fb])

    empty = pa.StructArray.from_arrays([], fields=[])
    assert empty.type == empty_struct
    assert empty.to_pylist() == []

    # --- Inconsistent fields: declared int32 for an int64 array ---
    fa_int32 = pa.field("a", pa.int32())
    with pytest.raises(ValueError, match="int64 vs int32"):
        pa.StructArray.from_arrays(children, fields=[fa_int32, fb, fc])
539
540
def test_dictionary_from_numpy():
    """Build dictionary arrays from numpy inputs, with and without a mask."""
    indices = np.repeat([0, 1, 2], 2)
    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
    mask = np.array([False, False, True, False, False, False])

    d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
    d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)

    assert d1.indices.to_pylist() == indices.tolist()
    # The masked array's indices were previously never checked (the d1
    # assertion was duplicated instead).  Masked slots are expected to be
    # null in the indices, matching the ``d2[i] is pa.NULL`` check below.
    expected_masked_indices = [
        None if masked else index
        for index, masked in zip(indices.tolist(), mask.tolist())
    ]
    assert d2.indices.to_pylist() == expected_masked_indices

    assert d1.dictionary.to_pylist() == dictionary.tolist()
    assert d2.dictionary.to_pylist() == dictionary.tolist()

    for i, index in enumerate(indices):
        assert d1[i].as_py() == dictionary[index]

        if mask[i]:
            assert d2[i] is pa.NULL
        else:
            assert d2[i].as_py() == dictionary[index]
561
562
def test_dictionary_from_boxed_arrays():
    """Build a dictionary array from Arrow (not numpy) input arrays."""
    indices = np.repeat([0, 1, 2], 2)
    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)

    # Wrap both inputs as Arrow arrays before handing them over
    encoded = pa.DictionaryArray.from_arrays(pa.array(indices),
                                             pa.array(dictionary))

    assert encoded.indices.to_pylist() == indices.tolist()
    assert encoded.dictionary.to_pylist() == dictionary.tolist()

    for position, index in enumerate(indices):
        assert encoded[position].as_py() == dictionary[index]
577
578
def test_dictionary_from_arrays_boundscheck():
    """Out-of-range indices are rejected unless ``safe=False`` is passed."""
    dictionary = pa.array(['foo', 'bar', 'baz'])
    in_range = pa.array([0, 1, 2, 0, 1, 2])
    negative = pa.array([0, -1, 2])
    too_large = pa.array([0, 1, 2, 3])

    # Works fine
    pa.DictionaryArray.from_arrays(in_range, dictionary)

    for bad_indices in (negative, too_large):
        with pytest.raises(pa.ArrowException):
            pa.DictionaryArray.from_arrays(bad_indices, dictionary)

    # If we are confident that the indices are "safe" we can pass safe=False
    # to disable the boundschecking
    pa.DictionaryArray.from_arrays(negative, dictionary, safe=False)
598
599
def test_dictionary_indices():
    """The ``indices`` of a dictionary array must pass full validation."""
    # https://issues.apache.org/jira/browse/ARROW-6882
    encoded = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 0, 1, 2]),
        pa.array(['foo', 'bar', 'baz']))
    encoded.indices.validate(full=True)
606
607
@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
                         [(pa.ListArray, pa.list_),
                          (pa.LargeListArray, pa.large_list)])
def test_list_from_arrays(list_array_type, list_type_factory):
    """Build (large) list arrays from offsets and flat values."""
    binary_list = list_type_factory(pa.binary())

    # Arrow arrays as inputs
    offsets = pa.array(np.array([0, 2, 5, 8], dtype='i4'), type='int32')
    pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
    built = list_array_type.from_arrays(
        offsets, pa.array(pyvalues, type='binary'))
    expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]],
                        type=binary_list)
    assert built.equals(expected)

    # With nulls: a null offset produces a null list at that position
    values = [b'a', b'b', b'c', b'd', b'e', b'f']
    built = list_array_type.from_arrays([0, None, 2, 6], values)
    expected = pa.array([values[:2], None, values[2:]], type=binary_list)
    assert built.equals(expected)

    # Another edge case: null offset in the second-to-last position
    built = list_array_type.from_arrays([0, 2, None, 6], values)
    expected = pa.array([values[:2], values[2:], None], type=binary_list)
    assert built.equals(expected)

    # raise on invalid array
    with pytest.raises(ValueError):
        list_array_type.from_arrays([1, 3, 10], np.arange(5))

    # Non-monotonic offsets: construction succeeds, full validation fails
    built = list_array_type.from_arrays([0, 3, 2, 6], list(range(6)))
    with pytest.raises(ValueError):
        built.validate(full=True)
652
653
def test_map_from_arrays():
    """Build map arrays from offsets, keys and items."""
    map_type = pa.map_(pa.binary(), pa.int32())

    # Arrow arrays as inputs
    offsets = pa.array(np.array([0, 2, 5, 8], dtype='i4'), type='int32')
    pykeys = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
    pyitems = list(range(len(pykeys)))
    pairs = list(zip(pykeys, pyitems))
    built = pa.MapArray.from_arrays(offsets,
                                    pa.array(pykeys, type='binary'),
                                    pa.array(pyitems, type='i4'))
    expected = pa.array([pairs[:2], pairs[2:5], pairs[5:8]], type=map_type)
    assert built.equals(expected)

    # With nulls (a null offset and a null item)
    pykeys = [b'a', b'b', b'c', b'd', b'e', b'f']
    pyitems = [1, 2, 3, None, 4, 5]
    pairs = list(zip(pykeys, pyitems))
    built = pa.MapArray.from_arrays([0, None, 2, 6],
                                    pa.array(pykeys, type='binary'),
                                    pa.array(pyitems, type='i4'))
    expected = pa.array([pairs[:2], None, pairs[2:]], type=map_type)
    assert built.equals(expected)

    # check invalid usage

    offsets = [0, 1, 3, 5]
    keys = np.arange(5)
    items = np.arange(5)
    # Baseline: these inputs are accepted as they are
    _ = pa.MapArray.from_arrays(offsets, keys, items)

    # raise on invalid offsets
    with pytest.raises(ValueError):
        pa.MapArray.from_arrays(offsets + [6], keys, items)

    # raise on length of keys != items
    doubled_items = np.concatenate([items, items])
    with pytest.raises(ValueError):
        pa.MapArray.from_arrays(offsets, keys, doubled_items)

    # raise on keys with null
    keys_with_null = list(keys)[:-1] + [None]
    assert len(keys_with_null) == len(items)
    with pytest.raises(ValueError):
        pa.MapArray.from_arrays(offsets, keys_with_null, items)
703
704
def test_fixed_size_list_from_arrays():
    """Build a fixed size list array from flat values and a list size."""
    values = pa.array(range(12), pa.int64())

    built = pa.FixedSizeListArray.from_arrays(values, 4)
    assert built.to_pylist() == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
    assert built.type.equals(pa.list_(pa.int64(), 4))

    # raise on invalid values / list_size
    invalid_inputs = [
        # negative list size
        (values, -4),
        # array with list size 0 cannot be constructed with from_arrays
        (pa.array([], pa.int64()), 0),
        # length of values not multiple of 5
        (values, 5),
    ]
    for flat_values, list_size in invalid_inputs:
        with pytest.raises(ValueError):
            pa.FixedSizeListArray.from_arrays(flat_values, list_size)
722
723
def test_union_from_dense():
    """Build dense unions with/without field names and type codes."""
    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
    int64 = pa.array([1, 2, 3], type='int64')
    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
    logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8')
    value_offsets = pa.array([1, 0, 0, 2, 1, 2, 3], type='int32')
    py_value = [b'b', 1, b'a', b'c', 2, 3, b'd']

    def check_result(result, expected_field_names, expected_type_codes,
                     expected_type_code_values):
        result.validate(full=True)
        union_type = result.type
        actual_field_names = [union_type[i].name
                              for i in range(union_type.num_children)]
        assert actual_field_names == expected_field_names
        assert union_type.mode == "dense"
        assert union_type.type_codes == expected_type_codes
        assert result.to_pylist() == py_value
        assert expected_type_code_values.equals(result.type_codes)
        assert value_offsets.equals(result.offsets)
        assert result.child(0).equals(binary)
        assert result.child(1).equals(int64)
        # Child positions outside [0, 1] are rejected
        for missing in (-1, 2):
            with pytest.raises(KeyError):
                result.child(missing)

    # (positional args, keyword args, field names, type codes, code values)
    valid_cases = [
        # without field names and type codes
        ((types, value_offsets, [binary, int64]), {},
         ['0', '1'], [0, 1], types),
        # with field names
        ((types, value_offsets, [binary, int64], ['bin', 'int']), {},
         ['bin', 'int'], [0, 1], types),
        # with type codes
        ((logical_types, value_offsets, [binary, int64]),
         {'type_codes': [11, 13]},
         ['0', '1'], [11, 13], logical_types),
        # with field names and type codes
        ((logical_types, value_offsets, [binary, int64],
          ['bin', 'int'], [11, 13]), {},
         ['bin', 'int'], [11, 13], logical_types),
    ]
    for args, kwargs, names, codes, code_values in valid_cases:
        check_result(pa.UnionArray.from_dense(*args, **kwargs),
                     expected_field_names=names,
                     expected_type_codes=codes,
                     expected_type_code_values=code_values)

    # Inputs that can be constructed but must fail full validation
    invalid_cases = [
        # Bad type ids
        ((logical_types, value_offsets, [binary, int64]), {}),
        ((types, value_offsets, [binary, int64]), {'type_codes': [11, 13]}),
    ]
    for args, kwargs in invalid_cases:
        arr = pa.UnionArray.from_dense(*args, **kwargs)
        with pytest.raises(pa.ArrowInvalid):
            arr.validate(full=True)

    # Offset larger than child size
    bad_offsets = pa.array([0, 0, 1, 2, 1, 2, 4], type='int32')
    arr = pa.UnionArray.from_dense(types, bad_offsets, [binary, int64])
    with pytest.raises(pa.ArrowInvalid):
        arr.validate(full=True)
796
797
def test_union_from_sparse():
    """Build sparse unions with/without field names and type codes."""
    binary = pa.array([b'a', b' ', b'b', b'c', b' ', b' ', b'd'],
                      type='binary')
    int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64')
    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
    logical_types = pa.array([11, 13, 11, 11, 13, 13, 11], type='int8')
    py_value = [b'a', 1, b'b', b'c', 2, 3, b'd']

    def check_result(result, expected_field_names, expected_type_codes,
                     expected_type_code_values):
        result.validate(full=True)
        assert result.to_pylist() == py_value
        union_type = result.type
        actual_field_names = [union_type[i].name
                              for i in range(union_type.num_children)]
        assert actual_field_names == expected_field_names
        assert union_type.mode == "sparse"
        assert union_type.type_codes == expected_type_codes
        assert expected_type_code_values.equals(result.type_codes)
        assert result.child(0).equals(binary)
        assert result.child(1).equals(int64)
        # Accessing ``offsets`` on a sparse union is an error
        with pytest.raises(pa.ArrowTypeError):
            result.offsets
        # Child positions outside [0, 1] are rejected
        for missing in (-1, 2):
            with pytest.raises(KeyError):
                result.child(missing)

    # (positional args, keyword args, field names, type codes, code values)
    valid_cases = [
        # without field names and type codes
        ((types, [binary, int64]), {},
         ['0', '1'], [0, 1], types),
        # with field names
        ((types, [binary, int64], ['bin', 'int']), {},
         ['bin', 'int'], [0, 1], types),
        # with type codes
        ((logical_types, [binary, int64]), {'type_codes': [11, 13]},
         ['0', '1'], [11, 13], logical_types),
        # with field names and type codes
        ((logical_types, [binary, int64], ['bin', 'int'], [11, 13]), {},
         ['bin', 'int'], [11, 13], logical_types),
    ]
    for args, kwargs, names, codes, code_values in valid_cases:
        check_result(pa.UnionArray.from_sparse(*args, **kwargs),
                     expected_field_names=names,
                     expected_type_codes=codes,
                     expected_type_code_values=code_values)

    # Bad type ids: construction succeeds, full validation fails
    invalid_cases = [
        ((logical_types, [binary, int64]), {}),
        ((types, [binary, int64]), {'type_codes': [11, 13]}),
    ]
    for args, kwargs in invalid_cases:
        arr = pa.UnionArray.from_sparse(*args, **kwargs)
        with pytest.raises(pa.ArrowInvalid):
            arr.validate(full=True)

    # Invalid child length is rejected at construction time
    with pytest.raises(pa.ArrowInvalid):
        pa.UnionArray.from_sparse(logical_types, [binary, int64[1:]])
865
866
def test_union_array_slice():
    """Slicing sparse and dense union arrays matches list slicing."""
    # ARROW-2314
    sparse_types = pa.array([0, 0, 1, 1], type=pa.int8())
    sparse_children = [pa.array(["a", "b", "c", "d"]),
                       pa.array([1, 2, 3, 4])]
    sparse = pa.UnionArray.from_sparse(sparse_types, sparse_children)
    assert sparse[1:].to_pylist() == ["b", 3, 4]

    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
    int64 = pa.array([1, 2, 3], type='int64')
    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
    value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')

    dense = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
    py_values = dense.to_pylist()
    positions = range(len(dense))
    # Every (start, stop) pair with start <= stop, in nested-loop order
    for start, stop in itertools.combinations_with_replacement(positions, 2):
        assert dense[start:stop].to_pylist() == py_values[start:stop]
884
885
def _check_cast_case(case, *, safe=True, check_array_construction=True):
    """Check one cast case, via ``Array.cast`` and via ``pa.array``.

    Parameters
    ----------
    case : tuple
        ``(in_data, in_type, out_data, out_type)``.  ``in_data`` and
        ``out_data`` may already be ``pa.Array`` instances, in which case
        their type must equal ``in_type`` / ``out_type`` respectively.
    safe : bool, default True
        Forwarded to ``cast`` and to ``pa.array``.
    check_array_construction : bool, default True
        Also build an array from ``in_data`` directly with ``out_type``
        and compare it with the expected result.
    """
    in_data, in_type, out_data, out_type = case

    # Expected result
    if not isinstance(out_data, pa.Array):
        expected = pa.array(out_data, type=out_type)
    else:
        assert out_data.type == out_type
        expected = out_data

    # Source array to cast from
    if not isinstance(in_data, pa.Array):
        source = pa.array(in_data, type=in_type)
    else:
        assert in_data.type == in_type
        source = in_data

    # check casting an already created array
    casted = source.cast(out_type, safe=safe)
    casted.validate(full=True)
    assert casted.equals(expected)

    if not check_array_construction:
        return

    # constructing an array with out type which optionally involves casting
    # for more see ARROW-1949
    constructed = pa.array(in_data, type=out_type, safe=safe)
    assert constructed.equals(expected)
909
910
def test_cast_integers_safe():
    # Each case is (in_data, in_type, out_data, out_type).  All values fit
    # in the target type, so the default safe cast has to succeed.  The
    # expected data is built with the NumPy dtype matching the target
    # Arrow type.
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u2'), pa.uint16()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
    ]

    for case in safe_cases:
        _check_cast_case(case)

    # Each case is (in_data, in_type, out_type).  The value lies outside
    # the range of the target type, so a safe cast has to raise.
    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32', 'int16'),
        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
        (np.array([50000], dtype='u2'), 'uint16', 'int16')
    ]
    for in_data, in_type, out_type in unsafe_cases:
        in_arr = pa.array(in_data, type=in_type)

        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
937
938
def test_cast_none():
    # ARROW-3735: Ensure that calling cast(None) doesn't segfault.
    # A ValueError is expected instead.
    ints = pa.array([1, 2, 3])

    with pytest.raises(ValueError):
        ints.cast(None)
945
946
def test_cast_list_to_primitive():
    # ARROW-8070: cast segfaults on unsupported cast from list<binary> to utf8
    # Both list-to-primitive casts below must raise NotImplementedError.
    int_lists = pa.array([[1, 2], [3, 4]])
    with pytest.raises(NotImplementedError):
        int_lists.cast(pa.int8())

    binary_lists = pa.array([[b"a", b"b"], [b"c"]], pa.list_(pa.binary()))
    with pytest.raises(NotImplementedError):
        binary_lists.cast(pa.binary())
956
957
def test_slice_chunked_array_zero_chunks():
    # ARROW-8911
    empty = pa.chunked_array([], type='int8')
    assert empty.num_chunks == 0

    # A full slice compares equal to the array it was taken from
    full_slice = empty[:]
    assert full_slice.equals(empty)

    # Do not crash
    empty[:5]
968
969
def test_cast_chunked_array():
    # Casting a chunked array must give the same result as casting each
    # of its chunks individually.
    chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
    target = pa.float64()

    casted = pa.chunked_array(chunks).cast(target)

    expected_chunks = []
    for chunk in chunks:
        expected_chunks.append(chunk.cast(target))
    assert casted.equals(pa.chunked_array(expected_chunks))
978
979
def test_cast_chunked_array_empty():
    # ARROW-8142
    # (source type, target type) pairs cast on a chunked array without chunks
    type_pairs = [
        (pa.dictionary(pa.int8(), pa.string()), pa.string()),
        (pa.int64(), pa.int32()),
    ]
    for source_type, target_type in type_pairs:
        empty = pa.chunked_array([], type=source_type)
        expected = pa.chunked_array([], type=target_type)
        assert empty.cast(target_type).equals(expected)
989
990
def test_chunked_array_data_warns():
    # Reading `.data` has to emit a FutureWarning; the value obtained is
    # expected to be a ChunkedArray.
    with pytest.warns(FutureWarning):
        data = pa.chunked_array([[]]).data
    assert isinstance(data, pa.ChunkedArray)
995
996
def test_cast_integers_unsafe():
    # We let NumPy do the unsafe casting.  The expected (wrapped) values
    # are produced with ndarray.astype(): handing an out-of-range Python
    # integer straight to np.array(..., dtype=...) is deprecated since
    # NumPy 1.24 and raises OverflowError with NumPy 2.
    # Each case is (in_data, in_type, out_data, out_type).
    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32',
         np.array([50000]).astype('i2'), pa.int16()),
        (np.array([70000], dtype='i4'), 'int32',
         np.array([70000]).astype('u2'), pa.uint16()),
        (np.array([-1], dtype='i4'), 'int32',
         np.array([-1]).astype('u2'), pa.uint16()),
        (np.array([50000], dtype='u2'), pa.uint16(),
         np.array([50000]).astype('i2'), pa.int16())
    ]

    for case in unsafe_cases:
        _check_cast_case(case, safe=False)
1012
1013
def test_floating_point_truncate_safe():
    # Floats holding integral values: the safe cast to int32 is expected
    # to succeed.  Rows are (float dtype, float values, expected integers).
    conversions = [
        ('float32', [1.0, 2.0, 3.0], [1, 2, 3]),
        ('float64', [1.0, 2.0, 3.0], [1, 2, 3]),
        ('float64', [-10.0, 20.0, -30.0], [-10, 20, -30]),
    ]
    for float_type, floats, ints in conversions:
        case = (np.array(floats, dtype=float_type), float_type,
                np.array(ints, dtype='i4'), pa.int32())
        _check_cast_case(case, safe=True)
1025
1026
def test_floating_point_truncate_unsafe():
    # Floats with a fractional part.
    # Rows are (float dtype, float values, expected integers).
    conversions = [
        ('float32', [1.1, 2.2, 3.3], [1, 2, 3]),
        ('float64', [1.1, 2.2, 3.3], [1, 2, 3]),
        ('float64', [-10.1, 20.2, -30.3], [-10, 20, -30]),
    ]
    for float_type, floats, truncated in conversions:
        case = (np.array(floats, dtype=float_type), float_type,
                np.array(truncated, dtype='i4'), pa.int32())

        # test safe casting raises
        with pytest.raises(pa.ArrowInvalid,
                           match='Floating point value truncated'):
            _check_cast_case(case, safe=True)

        # test unsafe casting truncates
        _check_cast_case(case, safe=False)
1044
1045
def test_decimal_to_int_safe():
    # Decimals with integral values that are expected to cast cleanly.
    # Each case is (in_data, in_type, out_data, out_type).
    dec = decimal.Decimal
    safe_cases = [
        ([dec("123456"), None, dec("-912345")], pa.decimal128(32, 5),
         [123456, None, -912345], pa.int32()),
        ([dec("1234"), None, dec("-9123")], pa.decimal128(19, 10),
         [1234, None, -9123], pa.int16()),
        ([dec("123"), None, dec("-91")], pa.decimal128(19, 10),
         [123, None, -91], pa.int8()),
    ]
    for case in safe_cases:
        # Once with the default `safe` and once with it spelled out
        _check_cast_case(case)
        _check_cast_case(case, safe=True)
1070
1071
def test_decimal_to_int_value_out_of_bounds():
    # Each case is (in_data, in_type, out_data, out_type); out_data holds
    # the values expected from the unsafe cast.
    dec = decimal.Decimal
    out_of_bounds_cases = [
        (np.array([dec("1234567890123"), None, dec("-912345678901234")]),
         pa.decimal128(32, 5),
         [1912276171, None, -135950322],
         pa.int32()),
        ([dec("123456"), None, dec("-912345678")],
         pa.decimal128(32, 5),
         [-7616, None, -19022],
         pa.int16()),
        ([dec("1234"), None, dec("-9123")],
         pa.decimal128(32, 5),
         [-46, None, 93],
         pa.int8()),
    ]

    for case in out_of_bounds_cases:
        # test safe casting raises
        with pytest.raises(pa.ArrowInvalid,
                           match='Integer value out of bounds'):
            _check_cast_case(case)

        # XXX `safe=False` can be ignored when constructing an array
        # from a sequence of Python objects (ARROW-8567)
        _check_cast_case(case, safe=False, check_array_construction=False)
1107
1108
def test_decimal_to_int_non_integer():
    # Decimals with a fractional part.
    # Each case is (in_data, in_type, out_data, out_type).
    dec = decimal.Decimal
    msg_regexp = 'Rescaling decimal value would cause data loss'
    non_integer_cases = [
        ([dec("123456.21"), None, dec("-912345.13")],
         pa.decimal128(32, 5),
         [123456, None, -912345],
         pa.int32()),
        ([dec("1234.134"), None, dec("-9123.1")],
         pa.decimal128(19, 10),
         [1234, None, -9123],
         pa.int16()),
        ([dec("123.1451"), None, dec("-91.21")],
         pa.decimal128(19, 10),
         [123, None, -91],
         pa.int8()),
    ]

    for case in non_integer_cases:
        # test safe casting raises
        with pytest.raises(pa.ArrowInvalid, match=msg_regexp):
            _check_cast_case(case)

        # the unsafe cast is expected to yield the integers in out_data
        _check_cast_case(case, safe=False)
1142
1143
def test_decimal_to_decimal():
    values = [decimal.Decimal("1234.12"), None]
    arr = pa.array(values, type=pa.decimal128(19, 10))

    # decimal128(15, 6) still holds the value exactly
    narrowed = arr.cast(pa.decimal128(15, 6))
    assert narrowed.equals(pa.array(values, type=pa.decimal128(15, 6)))

    # A scale of 1 cannot hold the second fractional digit: the safe cast
    # has to raise
    with pytest.raises(pa.ArrowInvalid,
                       match='Rescaling decimal value would cause data loss'):
        arr.cast(pa.decimal128(9, 1))

    # The unsafe cast drops that digit
    shortened = arr.cast(pa.decimal128(9, 1), safe=False)
    expected = pa.array([decimal.Decimal("1234.1"), None],
                        type=pa.decimal128(9, 1))
    assert shortened.equals(expected)

    # TODO FIXME
    # this should fail but decimal overflow is not implemented
    arr.cast(pa.decimal128(1, 2))
1170
1171
def test_safe_cast_nan_to_int_raises():
    # A safe cast of an array containing NaN to int64 has to raise
    floats = pa.array([np.nan, 1.])
    expected_message = 'Floating point value truncated'

    with pytest.raises(pa.ArrowInvalid, match=expected_message):
        floats.cast(pa.int64(), safe=True)
1178
1179
def test_cast_signed_to_unsigned():
    # Rows are (signed NumPy dtype, unsigned NumPy dtype, Arrow type).
    # The signed input data is paired with the unsigned Arrow type both
    # as input type and as output type of the case.
    widths = [
        ('i1', 'u1', pa.uint8()),
        ('i2', 'u2', pa.uint16()),
    ]
    for signed, unsigned, arrow_type in widths:
        case = (np.array([0, 1, 2, 3], dtype=signed), arrow_type,
                np.array([0, 1, 2, 3], dtype=unsigned), arrow_type)
        _check_cast_case(case)
1190
1191
def test_cast_from_null():
    nulls = [None] * 3
    null_type = pa.null()

    # Target types an all-null array is expected to cast to
    struct_type = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.list_(pa.int8())),
        pa.field('c', pa.string()),
    ])
    supported_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.list_(pa.int32(), 4),
        pa.large_list(pa.uint8()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.duration('us'),
        struct_type,
    ]
    for out_type in supported_types:
        _check_cast_case((nulls, null_type, nulls, out_type))

    # Target types for which the cast has to raise NotImplementedError
    union_fields = [pa.field('a', pa.binary(10)),
                    pa.field('b', pa.string())]
    unsupported_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union(union_fields, mode=pa.lib.UnionMode_DENSE),
        pa.union(union_fields, mode=pa.lib.UnionMode_SPARSE),
    ]
    null_arr = pa.array(nulls, type=pa.null())
    for out_type in unsupported_types:
        with pytest.raises(NotImplementedError):
            null_arr.cast(out_type)
1228
1229
def test_cast_string_to_number_roundtrip():
    # Pairs of (string array, equivalent integer array); the cast is
    # checked in both directions.
    cases = [
        (pa.array(["1", "127", "-128"]),
         pa.array([1, 127, -128], type=pa.int8())),
        (pa.array([None, "18446744073709551615"]),
         pa.array([None, 18446744073709551615], type=pa.uint64())),
    ]
    for strings, numbers in cases:
        parsed = strings.cast(numbers.type, safe=True)
        parsed.validate(full=True)
        assert parsed.equals(numbers)

        formatted = parsed.cast(strings.type, safe=True)
        formatted.validate(full=True)
        assert formatted.equals(strings)
1244
1245
def test_cast_dictionary():
    indices = pa.array([0, 1, None], type=pa.int32())
    dictionary = pa.array(["foo", "bar"])
    dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary)

    # Casting to the value type decodes the dictionary
    decoded = dict_arr.cast(pa.string())
    assert decoded.equals(pa.array(["foo", "bar", None]))

    # Shouldn't crash (ARROW-7077)
    with pytest.raises(pa.ArrowInvalid):
        dict_arr.cast(pa.int32())
1254
1255
def test_view():
    # ARROW-5992
    strings = ['foo', 'bar', 'baz']
    utf8_arr = pa.array(strings, type=pa.utf8())
    binary_arr = pa.array(strings, type=pa.binary())

    # The target type is given once as a DataType and once as a string
    for target in (pa.binary(), 'binary'):
        assert utf8_arr.view(target).equals(binary_arr)
1263
1264
def test_unique_simple():
    # Pairs of (input array, expected distinct values)
    cases = [
        (pa.array([1, 2, 3, 1, 2, 3]), pa.array([1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.array(['foo', None, 'bar'])),
        (pa.array(['foo', None, 'bar', 'foo'], pa.large_binary()),
         pa.array(['foo', None, 'bar'], pa.large_binary())),
    ]
    for arr, expected in cases:
        # Checked on the array itself and on a chunked array wrapping it
        for values in (arr, pa.chunked_array([arr])):
            distinct = values.unique()
            assert distinct.equals(expected)
1278
1279
def test_value_counts_simple():
    # Triples of (input array, expected values, expected counts)
    cases = [
        (pa.array([1, 2, 3, 1, 2, 3]),
         pa.array([1, 2, 3]),
         pa.array([2, 2, 2], type=pa.int64())),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.array(['foo', None, 'bar']),
         pa.array([2, 1, 1], type=pa.int64())),
        (pa.array(['foo', None, 'bar', 'foo'], pa.large_binary()),
         pa.array(['foo', None, 'bar'], pa.large_binary()),
         pa.array([2, 1, 1], type=pa.int64())),
    ]
    for arr, expected_values, expected_counts in cases:
        # Checked on the array itself and on a chunked array wrapping it
        for arr_in in (arr, pa.chunked_array([arr])):
            counted = arr_in.value_counts()
            expected_type = pa.struct([pa.field("values", arr.type),
                                       pa.field("counts", pa.int64())])
            assert counted.type.equals(expected_type)
            assert counted.field("values").equals(expected_values)
            assert counted.field("counts").equals(expected_counts)
1300
1301
def test_dictionary_encode_simple():
    # Pairs of (plain array, expected dictionary-encoded array)
    cases = [
        (pa.array([1, 2, 3, None, 1, 2, 3]),
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, 2, None, 0, 1, 2], type='int32'),
             [1, 2, 3])),
        (pa.array(['foo', None, 'bar', 'foo']),
         pa.DictionaryArray.from_arrays(
             pa.array([0, None, 1, 0], type='int32'),
             ['foo', 'bar'])),
        (pa.array(['foo', None, 'bar', 'foo'], type=pa.large_binary()),
         pa.DictionaryArray.from_arrays(
             pa.array([0, None, 1, 0], type='int32'),
             pa.array(['foo', 'bar'], type=pa.large_binary()))),
    ]
    for plain, expected in cases:
        encoded = plain.dictionary_encode()
        assert encoded.equals(expected)

        # Chunked array with a single chunk
        encoded_chunked = pa.chunked_array([plain]).dictionary_encode()
        assert encoded_chunked.num_chunks == 1
        assert encoded_chunked.chunk(0).equals(expected)

        # Chunked array without any chunk
        no_chunks = pa.chunked_array([], type=plain.type)
        encoded_empty = no_chunks.dictionary_encode()
        assert encoded_empty.num_chunks == 0
        assert encoded_empty.type == expected.type
1326
1327
def test_dictionary_encode_sliced():
    # Pairs of (sliced plain array, expected dictionary-encoded array).
    # Each input is a [1:-1] slice of a longer array.
    cases = [
        (pa.array([1, 2, 3, None, 1, 2, 3])[1:-1],
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, None, 2, 0], type='int32'),
             [2, 3, 1])),
        (pa.array([None, 'foo', 'bar', 'foo', 'xyzzy'])[1:-1],
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, 0], type='int32'),
             ['foo', 'bar'])),
        (pa.array([None, 'foo', 'bar', 'foo', 'xyzzy'],
                  type=pa.large_string())[1:-1],
         pa.DictionaryArray.from_arrays(
             pa.array([0, 1, 0], type='int32'),
             pa.array(['foo', 'bar'], type=pa.large_string()))),
    ]
    for sliced, expected in cases:
        encoded = sliced.dictionary_encode()
        assert encoded.equals(expected)

        # Chunked array with a single chunk
        encoded_chunked = pa.chunked_array([sliced]).dictionary_encode()
        assert encoded_chunked.num_chunks == 1
        assert encoded_chunked.type == expected.type
        assert encoded_chunked.chunk(0).equals(expected)

        # Chunked array without any chunk
        no_chunks = pa.chunked_array([], type=sliced.type)
        encoded_empty = no_chunks.dictionary_encode()
        assert encoded_empty.num_chunks == 0
        assert encoded_empty.type == expected.type
1354
1355
def test_dictionary_encode_zero_length():
    # User-facing experience of ARROW-7008
    empty_strings = pa.array([], type=pa.string())
    encoded = empty_strings.dictionary_encode()

    # The dictionary is empty and the encoded array passes full validation
    assert len(encoded.dictionary) == 0
    encoded.validate(full=True)
1362
1363
def test_cast_time32_to_int():
    # time32[s] values cast to int32 keep their underlying integers
    raw = np.array([0, 1, 2], dtype='int32')
    times = pa.array(raw, type=pa.time32('s'))

    as_int = times.cast('i4')
    assert as_int.equals(pa.array([0, 1, 2], type='i4'))
1371
1372
def test_cast_time64_to_int():
    # time64[us] values cast to int64 keep their underlying integers
    raw = np.array([0, 1, 2], dtype='int64')
    times = pa.array(raw, type=pa.time64('us'))

    as_int = times.cast('i8')
    assert as_int.equals(pa.array([0, 1, 2], type='i8'))
1380
1381
def test_cast_timestamp_to_int():
    # timestamp[us] values cast to int64 keep their underlying integers
    raw = np.array([0, 1, 2], dtype='int64')
    stamps = pa.array(raw, type=pa.timestamp('us'))

    as_int = stamps.cast('i8')
    assert as_int.equals(pa.array([0, 1, 2], type='i8'))
1389
1390
def test_cast_date32_to_int():
    # int32 -> date32 -> int32 round trip
    day_numbers = pa.array([0, 1, 2], type='i4')

    dates = day_numbers.cast('date32')
    round_tripped = dates.cast('i4')

    # 0, 1 and 2 correspond to 1970-01-01 and the two following days
    epoch = datetime.date(1970, 1, 1)
    expected_dates = pa.array(
        [epoch + datetime.timedelta(days=offset) for offset in range(3)]
    ).cast('date32')

    assert dates.equals(expected_dates)
    assert round_tripped.equals(day_numbers)
1405
1406
def test_cast_duration_to_int():
    # duration[us] values cast to int64 keep their underlying integers
    raw = np.array([0, 1, 2], dtype='int64')
    durations = pa.array(raw, type=pa.duration('us'))

    as_int = durations.cast('i8')
    assert as_int.equals(pa.array([0, 1, 2], type='i8'))
1414
1415
def test_cast_binary_to_utf8():
    # ASCII bytes cast to the equivalent strings
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    assert utf8_arr.equals(pa.array(['foo', 'bar', 'baz'], type=pa.utf8()))

    # UTF-16 encoded bytes are rejected by the cast
    non_utf8_values = ['mañana'.encode('utf-16-le')]
    invalid = pa.array(non_utf8_values)
    assert invalid.type == pa.binary()
    with pytest.raises(ValueError):
        invalid.cast(pa.string())

    # The same bytes behind a mask marking the slot as null
    all_null = pa.array(non_utf8_values, mask=np.array([True]),
                        type=pa.binary())
    # No error
    casted = all_null.cast(pa.string())
    assert casted.null_count == 1
1434
1435
def test_cast_date64_to_int():
    # date64 values cast to int64 keep their underlying integers
    raw = np.array([0, 1, 2], dtype='int64')
    dates = pa.array(raw, type=pa.date64())

    as_int = dates.cast('i8')
    assert as_int.equals(pa.array([0, 1, 2], type='i8'))
1444
1445
def test_date64_from_builtin_datetime():
    with_time = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456)
    midnight = datetime.datetime(2000, 1, 1)
    from_datetimes = pa.array([with_time, midnight], type='date64')
    from_dates = pa.array([with_time.date(), midnight.date()], type='date64')

    # datetimes and their plain dates give equal date64 arrays
    assert from_datetimes.equals(from_dates)

    # both entries fall on the same day, so the stored integers agree
    as_i8 = from_datetimes.view('int64')
    assert as_i8[0].as_py() == as_i8[1].as_py()
1456
1457
@pytest.mark.parametrize(('ty', 'values'), [
    ('bool', [True, False, True]),
    ('uint8', range(0, 255)),
    ('int8', range(0, 128)),
    ('uint16', range(0, 10)),
    ('int16', range(0, 10)),
    ('uint32', range(0, 10)),
    ('int32', range(0, 10)),
    ('uint64', range(0, 10)),
    ('int64', range(0, 10)),
    ('float', [0.0, 0.1, 0.2]),
    ('double', [0.0, 0.1, 0.2]),
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])
])
def test_cast_identities(ty, values):
    # Casting an array to the type it already has must give an equal array
    original = pa.array(values, type=ty)
    recast = original.cast(ty)
    assert recast.equals(original)
1477
1478
# Shared parametrization for the array pickling tests: each entry is a
# (data, typ) pair that the tests pass to pa.array(data, type=typ).
# A `typ` of None is paired with empty data.
pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], None),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([[4, 5], [6]], pa.large_list(pa.int16())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)
1494
1495
@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    original = pa.array(data, type=typ)

    # Round-trip through every pickle protocol the interpreter offers
    for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
        payload = pickle.dumps(original, protocol)
        restored = pickle.loads(payload)
        assert original.equals(restored)
1504
1505
def test_array_pickle_dictionary():
    # not included in the above as dictionary array cannot be created with
    # the pa.array function
    indices = [0, 1, 2, 0, 1]
    dictionary = ['a', 'b', 'c']
    original = pa.DictionaryArray.from_arrays(indices, dictionary)

    # Round-trip through every pickle protocol the interpreter offers
    for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
        payload = pickle.dumps(original, protocol)
        restored = pickle.loads(payload)
        assert original.equals(restored)
1513
1514
@h.given(
    past.arrays(
        past.all_types,
        size=st.integers(min_value=0, max_value=10)
    )
)
def test_pickling(arr):
    # Property test: a generated array with 0 to 10 elements has to
    # survive a pickle round trip with the default protocol.
    payload = pickle.dumps(arr)
    assert arr.equals(pickle.loads(payload))
1525
1526
@pickle_test_parametrize
def test_array_pickle5(data, typ):
    # Test zero-copy pickling with protocol 5 (PEP 574)
    # Use the pickle5 backport when it was importable, the stdlib otherwise
    picklemod = pickle5 or pickle
    if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5:
        pytest.skip("need pickle5 package or Python 3.8+")

    array = pa.array(data, type=typ)
    # A missing buffer is recorded as address 0
    addresses = [buf.address if buf is not None else 0
                 for buf in array.buffers()]

    # The upper bound has to come from the module doing the pickling: with
    # the pickle5 backport the stdlib HIGHEST_PROTOCOL is below 5, which
    # would leave this loop empty and the test vacuously green.
    for proto in range(5, picklemod.HIGHEST_PROTOCOL + 1):
        buffers = []
        pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append)
        result = picklemod.loads(pickled, buffers=buffers)
        assert array.equals(result)

        # Zero-copy: the restored buffers live at the original addresses
        result_addresses = [buf.address if buf is not None else 0
                            for buf in result.buffers()]
        assert result_addresses == addresses
1547
1548
@pytest.mark.parametrize(
    'narr',
    [
        np.arange(10, dtype=dtype)
        for dtype in (
            np.int64, np.int32, np.int16, np.int8,
            np.uint64, np.uint32, np.uint16, np.uint8,
            np.float64, np.float32, np.float16,
        )
    ]
)
def test_to_numpy_roundtrip(narr):
    # NumPy -> Arrow -> NumPy keeps dtype and values
    arr = pa.array(narr)
    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())

    # The same holds for slices taken on both sides
    for window in (slice(None, 6), slice(2, None), slice(2, 6)):
        np.testing.assert_array_equal(narr[window], arr[window].to_numpy())
1572
1573
def test_array_uint64_from_py_over_range():
    # 2 ** 63 is built once from a Python int and once from a NumPy
    # uint64 array; both results have to be equal.
    big = 2 ** 63
    from_python = pa.array([big], type=pa.uint64())
    from_numpy = pa.array(np.array([big], dtype='u8'))
    assert from_python.equals(from_numpy)
1578
1579
def test_array_conversions_no_sentinel_values():
    # Converting the int8 array adds exactly one reference to it
    ints = np.array([1, 2, 3, 4], dtype='int8')
    refs_before = sys.getrefcount(ints)
    from_ints = pa.array(ints)  # noqa
    assert sys.getrefcount(ints) == (refs_before + 1)

    assert from_ints.type == 'int8'

    # NaN entries are not counted as nulls
    floats = np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32')
    from_floats = pa.array(floats, type='float32')
    assert from_floats.type == 'float32'
    assert from_floats.null_count == 0
1592
1593
def test_time32_time64_from_integer():
    # ARROW-4111
    # Rows are (integers, time type, equivalent datetime.time values):
    # building the array from either column has to give equal results.
    cases = [
        ([1, 2, None], pa.time32('s'),
         [datetime.time(second=1), datetime.time(second=2), None]),
        ([1, 2, None], pa.time32('ms'),
         [datetime.time(microsecond=1000),
          datetime.time(microsecond=2000), None]),
        ([1, 2, None], pa.time64('us'),
         [datetime.time(microsecond=1),
          datetime.time(microsecond=2), None]),
        ([1000, 2000, None], pa.time64('ns'),
         [datetime.time(microsecond=1),
          datetime.time(microsecond=2), None]),
    ]
    for integers, time_type, times in cases:
        from_integers = pa.array(integers, type=time_type)
        from_times = pa.array(times, type=time_type)
        assert from_integers.equals(from_times)
1619
1620
def test_binary_string_pandas_null_sentinels():
    # ARROW-6227
    # With from_pandas=True a NaN entry yields the same array as None
    for ty in ('binary', 'utf8'):
        with_nan = pa.array(['string', np.nan], type=ty, from_pandas=True)
        with_none = pa.array(['string', None], type=ty)
        assert with_nan.equals(with_none)
1629
1630
def test_pandas_null_sentinels_raise_error():
    # ARROW-6227
    # Pairs of (values containing NaN, requested type); every type is
    # listed once.
    cases = [
        ([None, np.nan], 'null'),
        (['string', np.nan], 'binary'),
        (['string', np.nan], 'utf8'),
        (['string', np.nan], 'large_binary'),
        (['string', np.nan], 'large_utf8'),
        ([b'string', np.nan], pa.binary(6)),
        ([True, np.nan], pa.bool_()),
        ([decimal.Decimal('0'), np.nan], pa.decimal128(12, 2)),
        ([0, np.nan], pa.date32()),
        ([0, np.nan], pa.date64()),
        ([0, np.nan], pa.time32('s')),
        ([0, np.nan], pa.time64('us')),
        ([0, np.nan], pa.timestamp('us')),
        ([0, np.nan], pa.duration('us')),
    ]
    for case, ty in cases:
        # Both types of exceptions are raised. May want to clean that up
        with pytest.raises((ValueError, TypeError)):
            pa.array(case, type=ty)

        # from_pandas option suppresses failure
        result = pa.array(case, type=ty, from_pandas=True)
        # In the 'null' case both entries count as null, otherwise only NaN
        assert result.null_count == (1 if ty != 'null' else 2)
1658
1659
@pytest.mark.pandas
def test_pandas_null_sentinels_index():
    # ARROW-7023 - ensure that when passing a pandas Index, "from_pandas"
    # semantics are used
    import pandas as pd
    values = [1, 2, np.nan]
    index = pd.Index(values, dtype=object)

    from_index = pa.array(index)
    from_list = pa.array(values, from_pandas=True)
    assert from_index.equals(from_list)
1669
1670
def test_array_from_numpy_datetimeD():
    # A datetime64[D] NumPy array converts to the same array as the
    # Python dates built with type date32
    dates = [None, datetime.date(2017, 4, 4)]
    np_days = np.array(dates, dtype='datetime64[D]')

    from_numpy = pa.array(np_days)
    assert from_numpy.equals(pa.array(dates, type=pa.date32()))
1677
1678
@pytest.mark.parametrize(('dtype', 'type'), [
    ('datetime64[s]', pa.timestamp('s')),
    ('datetime64[ms]', pa.timestamp('ms')),
    ('datetime64[us]', pa.timestamp('us')),
    ('datetime64[ns]', pa.timestamp('ns'))
])
def test_array_from_numpy_datetime(dtype, type):
    data = [
        None,
        datetime.datetime(2017, 4, 4, 12, 11, 10),
        datetime.datetime(2018, 1, 1, 0, 2, 0)
    ]
    np_values = np.array(data, dtype=dtype)
    expected = pa.array(data, type=type)

    # from numpy array
    assert pa.array(np_values).equals(expected)

    # from list of numpy scalars
    assert pa.array(list(np_values)).equals(expected)
1700
1701
def test_array_from_different_numpy_datetime_units_raises():
    data = [
        None,
        datetime.datetime(2017, 4, 4, 12, 11, 10),
        datetime.datetime(2018, 1, 1, 0, 2, 0)
    ]
    seconds = np.array(data, dtype='datetime64[s]')
    millis = np.array(data, dtype='datetime64[ms]')

    # First two scalars in seconds, the last one in milliseconds
    mixed = [*seconds[:2], *millis[2:]]

    with pytest.raises(pa.ArrowNotImplementedError):
        pa.array(mixed)
1714
1715
@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
def test_array_from_list_of_timestamps(unit):
    literals = [
        'NaT',
        '2017-01-01 01:01:01.111111111',
        '2018-11-22 12:24:48.111111111',
    ]
    stamps = [np.datetime64(literal, unit) for literal in literals]

    inferred = pa.array(stamps)
    explicit = pa.array(stamps, type=pa.timestamp(unit))

    # Type inference has to pick the timestamp type with the scalars' unit
    assert inferred.type == explicit.type
    assert inferred.type.unit == unit
    assert inferred[0] == explicit[0]
1728
1729
def test_array_from_timestamp_with_generic_unit():
    # datetime64 scalars created without passing a unit
    literals = [
        'NaT',
        '2017-01-01 01:01:01.111111111',
        '2018-11-22 12:24:48.111111111',
    ]
    stamps = [np.datetime64(literal) for literal in literals]

    with pytest.raises(pa.ArrowNotImplementedError,
                       match='Unbound or generic datetime64 time unit'):
        pa.array(stamps)
1738
1739
@pytest.mark.parametrize(('dtype', 'type'), [
    ('timedelta64[s]', pa.duration('s')),
    ('timedelta64[ms]', pa.duration('ms')),
    ('timedelta64[us]', pa.duration('us')),
    ('timedelta64[ns]', pa.duration('ns'))
])
def test_array_from_numpy_timedelta(dtype, type):
    data = [
        None,
        datetime.timedelta(1),
        datetime.timedelta(0, 1)
    ]

    # from numpy array
    from_numpy = pa.array(np.array(data, dtype=dtype))
    assert isinstance(from_numpy, pa.DurationArray)
    assert from_numpy.type == type
    expected = pa.array(data, type=type)
    assert from_numpy.equals(expected)
    assert from_numpy.to_pylist() == data

    # from list of numpy scalars
    scalars = list(np.array(data, dtype=dtype))
    from_scalars = pa.array(scalars)
    assert from_scalars.equals(expected)
    assert from_scalars.to_pylist() == data
1766
1767
def test_array_from_numpy_timedelta_incorrect_unit():
    unconvertible = (
        np.timedelta64(1),       # generic (no unit)
        np.timedelta64(1, 'M'),  # unsupported unit
    )
    for td in unconvertible:
        # once as a list of scalars, once as a NumPy array
        for data in [[td], np.array([td])]:
            with pytest.raises(NotImplementedError):
                pa.array(data)
1781
1782
def test_array_from_numpy_ascii():
    # Fixed-width bytes ('|S5') convert to a binary array
    np_bytes = np.array(['abcde', 'abc', ''], dtype='|S5')

    converted = pa.array(np_bytes)
    assert converted.type == 'binary'
    assert converted.equals(pa.array(['abcde', 'abc', ''], type='binary'))

    # Entries whose mask value is True come out as null
    null_mask = np.array([False, True, False])
    masked = pa.array(np_bytes, mask=null_mask)
    assert masked.equals(pa.array(['abcde', None, ''], type='binary'))

    # Strided variant
    np_bytes = np.array(['abcde', 'abc', ''] * 5, dtype='|S5')[::2]
    null_mask = np.array([False, True, False] * 5)[::2]
    strided = pa.array(np_bytes, mask=null_mask)

    expected = pa.array(['abcde', '', None, 'abcde', '', None, 'abcde', ''],
                        type='binary')
    assert strided.equals(expected)

    # 0 itemsize
    np_bytes = np.array(['', '', ''], dtype='|S0')
    zero_width = pa.array(np_bytes)
    assert zero_width.equals(pa.array(['', '', ''], type='binary'))
1810
1811
def test_array_from_numpy_unicode():
    """Fixed-width unicode ndarrays of either byte order convert to Arrow
    utf8 arrays, with and without a mask, strided, and with zero itemsize."""
    for dtype in ('<U5', '>U5'):
        np_text = np.array(['abcde', 'abc', ''], dtype=dtype)

        converted = pa.array(np_text)
        assert converted.type == 'utf8'
        assert converted.equals(pa.array(['abcde', 'abc', ''], type='utf8'))

        # entries flagged True in the mask come out as nulls
        null_mask = np.array([False, True, False])
        converted = pa.array(np_text, mask=null_mask)
        assert converted.equals(pa.array(['abcde', None, ''], type='utf8'))

        # Strided variant
        np_text = np.array(['abcde', 'abc', ''] * 5, dtype=dtype)[::2]
        null_mask = np.array([False, True, False] * 5)[::2]
        converted = pa.array(np_text, mask=null_mask)
        strided_expected = pa.array(
            ['abcde', '', None, 'abcde', '', None, 'abcde', ''],
            type='utf8')
        assert converted.equals(strided_expected)

    # 0 itemsize
    empty_items = np.array(['', '', ''], dtype='<U0')
    converted = pa.array(empty_items)
    assert converted.equals(pa.array(['', '', ''], type='utf8'))
1842
1843
def test_array_string_from_non_string():
    """Requesting a string type for an integer ndarray raises TypeError."""
    # ARROW-5682 - when converting to string raise on non string-like dtype
    integers = np.array([1, 2, 3])
    with pytest.raises(TypeError):
        pa.array(integers, type=pa.string())
1848
1849
def test_array_string_from_all_null():
    """All-null input can be converted to a string array (ARROW-5682)."""
    string_type = pa.string()

    all_none = np.array([None, None], dtype=object)
    assert pa.array(all_none, type=string_type).null_count == 2

    all_nan = np.array([np.nan, np.nan], dtype='float64')
    # by default raises, but accept as all-null when from_pandas=True
    with pytest.raises(TypeError):
        pa.array(all_nan, type=string_type)
    from_pandas = pa.array(all_nan, type=string_type, from_pandas=True)
    assert from_pandas.null_count == 2
1862
1863
def test_array_from_masked():
    """A numpy masked array converts with its masked slot as a null, and
    cannot be combined with an explicit ``mask`` argument."""
    masked = np.ma.array([1, 2, 3, 4], dtype='int64',
                         mask=[False, False, True, False])
    converted = pa.array(masked)
    assert pa.array([1, 2, None, 4], type='int64').equals(converted)

    extra_mask = np.array([True, False, False, False])
    with pytest.raises(ValueError, match="Cannot pass a numpy masked array"):
        pa.array(masked, mask=extra_mask)
1873
1874
def test_array_from_shrunken_masked():
    """A masked array constructed without a mask argument converts to the
    same array as the plain values."""
    masked = np.ma.array([0], dtype='int64')
    converted = pa.array(masked)
    assert pa.array([0], type='int64').equals(converted)
1880
1881
def test_array_from_invalid_dim_raises():
    """2-d and 0-d ndarrays are rejected with a ValueError."""
    expected_msg = "only handle 1-dimensional arrays"
    two_dim = np.array([[1, 2, 3], [4, 5, 6]])
    zero_dim = np.array(0)

    for bad_input in (two_dim, zero_dim):
        with pytest.raises(ValueError, match=expected_msg):
            pa.array(bad_input)
1891
1892
def test_array_from_strided_bool():
    """Column and row views of a 2-d bool ndarray convert correctly."""
    # ARROW-6325
    grid = np.ones((3, 2), dtype=bool)

    column = pa.array(grid[:, 0])
    assert column.equals(pa.array([True, True, True]))

    row = pa.array(grid[0, :])
    assert row.equals(pa.array([True, True]))
1902
1903
def test_buffers_primitive():
    """Check the raw buffers returned by ``Array.buffers()``.

    Covers an int16 array with a null, a slice of that array, an int8 array
    built from numpy (no validity bitmap), and a binary array (validity,
    offsets and data buffers).
    """
    a = pa.array([1, 2, None, 4], type=pa.int16())
    buffers = a.buffers()
    assert len(buffers) == 2
    null_bitmap = buffers[0].to_pybytes()
    assert 1 <= len(null_bitmap) <= 64  # XXX this is varying
    assert bytearray(null_bitmap)[0] == 0b00001011

    # Slicing does not affect the buffers but the offset
    a_sliced = a[1:]
    buffers = a_sliced.buffers()
    # This comparison was previously a bare expression with no `assert`,
    # so the offset of the slice was never actually verified.
    assert a_sliced.offset == 1
    assert len(buffers) == 2
    null_bitmap = buffers[0].to_pybytes()
    assert 1 <= len(null_bitmap) <= 64  # XXX this is varying
    assert bytearray(null_bitmap)[0] == 0b00001011

    # 'xx' skips the two bytes occupied by the null slot
    assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4)

    a = pa.array(np.int8([4, 5, 6]))
    buffers = a.buffers()
    assert len(buffers) == 2
    # No null bitmap from Numpy int array
    assert buffers[0] is None
    assert struct.unpack('3b', buffers[1].to_pybytes()) == (4, 5, 6)

    a = pa.array([b'foo!', None, b'bar!!'])
    buffers = a.buffers()
    assert len(buffers) == 3
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 4, 4, 9)
    values = buffers[2].to_pybytes()
    assert values == b'foo!bar!!'
1939
1940
def test_buffers_nested():
    """Check the buffers returned by ``buffers()`` for a list array and a
    struct array, parent buffers first and child buffers after."""

    def first_byte(buf):
        # leading byte of a buffer, as an int
        return bytearray(buf.to_pybytes())[0]

    list_arr = pa.array([[1, 2], None, [3, None, 4, 5]],
                        type=pa.list_(pa.int64()))
    list_bufs = list_arr.buffers()
    assert len(list_bufs) == 4
    # The parent buffers
    assert first_byte(list_bufs[0]) == 0b00000101
    parent_offsets = list_bufs[1].to_pybytes()
    assert struct.unpack('4i', parent_offsets) == (0, 2, 2, 6)
    # The child buffers
    assert first_byte(list_bufs[2]) == 0b00110111
    child_values = list_bufs[3].to_pybytes()
    assert struct.unpack('qqq8xqq', child_values) == (1, 2, 3, 4, 5)

    struct_type = pa.struct([pa.field('a', pa.int8()),
                             pa.field('b', pa.int16())])
    struct_arr = pa.array([(42, None), None, (None, 43)], type=struct_type)
    struct_bufs = struct_arr.buffers()
    assert len(struct_bufs) == 5
    # The parent buffer
    assert first_byte(struct_bufs[0]) == 0b00000101
    # The child buffers: 'a'
    assert first_byte(struct_bufs[1]) == 0b00000001
    a_values = struct_bufs[2].to_pybytes()
    assert struct.unpack('bxx', a_values) == (42,)
    # The child buffers: 'b'
    assert first_byte(struct_bufs[3]) == 0b00000100
    b_values = struct_bufs[4].to_pybytes()
    assert struct.unpack('4xh', b_values) == (43,)
1974
1975
def test_nbytes_sizeof():
    """``nbytes`` matches the expected buffer sizes and ``sys.getsizeof``
    accounts for at least that many bytes on top of the bare object."""

    def check(arr, expected_nbytes):
        assert arr.nbytes == expected_nbytes
        assert sys.getsizeof(arr) >= object.__sizeof__(arr) + arr.nbytes

    # three int64 values, no validity bitmap
    check(pa.array(np.array([4, 5, 6], dtype='int64')), 8 * 3)

    # three int64 values plus one bitmap byte
    check(pa.array([1, None, 3], type='int64'), 8*3 + 1)

    # list array: parent and child buffers
    nested = pa.array([[1, 2], None, [3, None, 4, 5]],
                      type=pa.list_(pa.int64()))
    check(nested, 1 + 4 * 4 + 1 + 6 * 8)
1986
1987
def test_invalid_tensor_constructor_repr():
    """Building a Tensor via its constructor and taking its repr raises
    TypeError."""
    # ARROW-2638: prevent calling extension class constructors directly
    ctor_args = [1]
    with pytest.raises(TypeError):
        repr(pa.Tensor(ctor_args))
1992
1993
def test_invalid_tensor_construction():
    """Calling ``pa.Tensor`` with no arguments raises TypeError."""
    tensor_cls = pa.Tensor
    with pytest.raises(TypeError):
        tensor_cls()
1997
1998
@pytest.mark.parametrize(('offset_type', 'list_type_factory'),
                         [(pa.int32(), pa.list_), (pa.int64(), pa.large_list)])
def test_list_array_flatten(offset_type, list_type_factory):
    """``flatten()``, ``.values`` and ``.offsets`` are checked against each
    other across two levels of list nesting."""
    leaf_type = pa.int64()
    inner_type = list_type_factory(leaf_type)
    outer_type = list_type_factory(inner_type)

    # innermost level: plain integers
    leaves = pa.array([1, None, 2, 3, 4, 5, 6, 7, 8], type=leaf_type)

    # middle level: lists of integers
    inner = pa.array(
        [[1, None, 2], None, [3, 4], [], [5, 6], None, [7, 8]],
        type=inner_type)
    inner_offsets = pa.array([0, 3, 3, 5, 5, 7, 7, 9], type=offset_type)

    # outer level: lists of lists of integers
    outer = pa.array(
        [
            None,
            [[1, None, 2], None, [3, 4]],
            [],
            [[], [5, 6], None],
            [[7, 8]],
        ],
        type=outer_type)
    outer_offsets = pa.array([0, 0, 3, 3, 6, 7], type=offset_type)

    assert outer.flatten().equals(inner)
    assert outer.offsets.equals(outer_offsets)
    assert outer.values.equals(inner)
    assert inner.flatten().equals(leaves)
    assert inner.offsets.equals(inner_offsets)
    assert inner.values.equals(leaves)
    assert outer.flatten().flatten().equals(leaves)
    assert outer.values.values.equals(leaves)
2053
2054
@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
def test_list_array_flatten_non_canonical(list_type_factory):
    """``flatten()`` omits values behind a null list slot while ``.values``
    still exposes them."""
    # Non-canonical list array (null elements backed by non-empty sublists)
    list_type = list_type_factory(pa.int64())
    canonical = pa.array([[1], [2, 3], [4, 5, 6]], type=list_type)

    # keep the original offsets buffer, swap in a new validity bitmap
    _, offsets_buffer = canonical.buffers()[:2]
    new_buffers = [pa.py_buffer(b"\x05"), offsets_buffer]
    masked = canonical.from_buffers(canonical.type, len(canonical),
                                    new_buffers,
                                    children=[canonical.values])
    assert masked.to_pylist() == [[1], None, [4, 5, 6]]
    assert masked.offsets.to_pylist() == [0, 1, 3, 6]

    flat = masked.flatten()
    flat.validate(full=True)
    assert flat.type == list_type.value_type
    assert flat.to_pylist() == [1, 4, 5, 6]

    # .values is the physical values array (including masked elements)
    assert masked.values.to_pylist() == [1, 2, 3, 4, 5, 6]
2073
2074
@pytest.mark.parametrize('klass', [pa.ListArray, pa.LargeListArray])
def test_list_array_values_offsets_sliced(klass):
    """``values``, ``offsets`` and ``flatten()`` on a sliced list array."""
    # ARROW-7301
    all_values = [1, 2, 3, 4, 5, 6]
    full = klass.from_arrays(offsets=[0, 3, 4, 6], values=all_values)
    assert full.values.to_pylist() == all_values
    assert full.offsets.to_pylist() == [0, 3, 4, 6]

    # After slicing, .values still exposes the complete values array while
    # .offsets is sliced too, so its entries keep pointing into that
    # complete array; flatten() returns only the values of the slice.
    tail = full[1:]
    assert tail.values.to_pylist() == all_values
    assert tail.offsets.to_pylist() == [3, 4, 6]
    assert tail.flatten().to_pylist() == [4, 5, 6]

    start = tail.offsets[0].as_py()
    stop = tail.offsets[1].as_py()
    first_item = tail[0].as_py()
    window = tail.values[start:stop].to_pylist()
    assert first_item == window == [4]
2092
2093
def test_fixed_size_list_array_flatten():
    """``flatten()`` on nested fixed-size lists, one and two levels deep."""
    leaf_type = pa.int64()
    pair_type = pa.list_(pa.int64(), 2)
    triple_type = pa.list_(pa.list_(pa.int64(), 2), 3)

    # lists of 3 lists of 2 integers
    triples = pa.array(
        [
            [[1, 2], [3, 4], [5, 6]],
            None,
            [[7, None], None, [8, 9]],
        ],
        type=triple_type)
    assert triples.type.equals(triple_type)

    # lists of 2 integers
    pairs = pa.array(
        [
            [1, 2], [3, 4], [5, 6],
            None, None, None,
            [7, None], None, [8, 9],
        ],
        type=pair_type)
    assert pairs.type.equals(pair_type)
    assert triples.flatten().equals(pairs)

    # plain integers
    leaves = pa.array(
        [
            1, 2, 3, 4, 5, 6,
            None, None, None, None, None, None,
            7, None, None, None, 8, 9,
        ],
        type=leaf_type)
    assert leaves.type.equals(leaf_type)
    assert pairs.flatten().equals(leaves)
    assert triples.flatten().flatten().equals(leaves)
2129
2130
def test_struct_array_flatten():
    """``StructArray.flatten()`` returns the child arrays, for full and
    sliced arrays and for several placements of nulls."""
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])

    def check_children(struct_arr, expected_xs, expected_ys):
        # flatten and compare both children; hand them back to the caller
        xs, ys = struct_arr.flatten()
        assert xs.to_pylist() == expected_xs
        assert ys.to_pylist() == expected_ys
        return xs, ys

    # no nulls at all
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    xs, ys = check_children(a, [1, 3, 5], [2.5, 4.5, 6.5])
    assert xs.type == pa.int16()
    assert ys.type == pa.float32()
    check_children(a[1:], [3, 5], [4.5, 6.5])

    # a null struct
    a = pa.array([(1, 2.5), None, (3, 4.5)], type=ty)
    check_children(a, [1, None, 3], [2.5, None, 4.5])
    check_children(a[1:], [None, 3], [None, 4.5])

    # null fields inside non-null structs
    a = pa.array([(1, None), (2, 3.5), (None, 4.5)], type=ty)
    check_children(a, [1, 2, None], [None, 3.5, 4.5])
    check_children(a[1:], [2, None], [3.5, 4.5])

    # null fields together with a null struct
    a = pa.array([(1, None), None, (None, 2.5)], type=ty)
    check_children(a, [1, None, None], [None, None, 2.5])
    check_children(a[1:], [None, None], [None, 2.5])
2167
2168
def test_struct_array_field():
    """``StructArray.field`` accepts positive and negative positions and
    names, and raises for invalid arguments."""
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x_pos, y_pos = a.field(0), a.field(1)
    x_neg, y_neg = a.field(-2), a.field(-1)
    x_name, y_name = a.field('x'), a.field('y')

    assert isinstance(x_pos, pa.lib.Int16Array)
    assert isinstance(y_neg, pa.lib.FloatArray)
    assert x_pos.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y_pos.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))
    assert x_pos.equals(x_neg)
    assert x_pos.equals(x_name)
    assert y_pos.equals(y_neg)
    assert y_pos.equals(y_name)

    # each group of bad arguments is paired with the error it should raise
    invalid_cases = [
        (TypeError, [None, pa.int16()]),
        (IndexError, [3, -3]),
        (KeyError, ['z', '']),
    ]
    for error_type, bad_arguments in invalid_cases:
        for bad_argument in bad_arguments:
            with pytest.raises(error_type):
                a.field(bad_argument)
2201
2202
def test_empty_cast():
    """Cast an empty array between every pair of a set of common types,
    tolerating casts reported as unsupported or invalid."""
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(), pa.int16(), pa.int32(), pa.int64(),
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
        pa.float16(), pa.float32(), pa.float64(),
        pa.date32(), pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    tolerated = (pa.lib.ArrowNotImplementedError, pa.ArrowInvalid)

    for source_type, target_type in itertools.product(types, repeat=2):
        try:
            # ARROW-4766: Ensure that supported types conversion don't segfault
            # on empty arrays of common types
            pa.array([], type=source_type).cast(target_type)
        except tolerated:
            pass
2232
2233
def test_nested_dictionary_array():
    """A DictionaryArray can be the values of a ListArray and the dictionary
    of another DictionaryArray."""
    # dictionary array as the values of a list array
    inner = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
    as_list = pa.ListArray.from_arrays([0, 2, 3], inner)
    assert as_list.to_pylist() == [['a', 'b'], ['a']]

    # dictionary array as the dictionary of another dictionary array
    inner = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
    outer = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], inner)
    assert outer.to_pylist() == ['a', 'b', 'a', 'b', 'a']
2242
2243
def test_array_from_numpy_str_utf8():
    """numpy string arrays convert to Arrow strings, and bytes that are not
    valid UTF-8 are rejected with ValueError."""
    # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python
    # 2 they are NPY_STRING (binary), so we must do UTF-8 validation
    words = ["toto", "tata"]
    native_vec = np.array(words)
    object_vec = np.array(words, dtype=object)
    reference = pa.array(["toto", "tata"])

    from_native = pa.array(native_vec, pa.string())
    from_object = pa.array(object_vec, pa.string())
    assert from_native.equals(reference)
    assert from_object.equals(reference)

    # with mask, separate code path
    nothing_masked = np.array([False, False], dtype=bool)
    from_masked = pa.array(native_vec, pa.string(), mask=nothing_masked)
    assert from_masked.equals(reference)

    # UTF8 validation failures
    utf16_vec = np.array([('mañana').encode('utf-16-le')])
    with pytest.raises(ValueError):
        pa.array(utf16_vec, pa.string())

    with pytest.raises(ValueError):
        pa.array(utf16_vec, pa.string(), mask=np.array([False]))
2268
2269
@pytest.mark.large_memory
def test_numpy_binary_overflow_to_chunked():
    """Converting over 2 GiB of numpy bytes/unicode data yields a
    ChunkedArray whose values match the input."""
    # ARROW-3762, ARROW-5966

    repeats = 1 << 11

    # Ten distinct 1MB strings (differing only in the last character),
    # cycled 2048 times after a leading 1-byte value: 2^31 + 1 bytes total
    prefix = b'x' * ((1 << 20) - 1)
    binary_pool = [prefix + str(digit % 10).encode('utf8')
                   for digit in range(10)]
    unicode_pool = [blob.decode('utf8') for blob in binary_pool]

    binary_values = [b'x'] + [binary_pool[i % 10] for i in range(repeats)]
    unicode_values = ['x'] + [unicode_pool[i % 10] for i in range(repeats)]

    cases = [(binary_values, pa.binary()), (unicode_values, pa.utf8())]
    for case, ex_type in cases:
        np_case = np.array(case)
        chunked = pa.array(np_case)
        # drop the reference to the large ndarray
        del np_case

        assert isinstance(chunked, pa.ChunkedArray)
        assert chunked.type == ex_type

        # Split up into 16MB chunks. 128 * 16 = 2048, so 129
        assert chunked.num_chunks == 129

        # walk every chunk in order and compare against the input
        scalars = itertools.chain.from_iterable(
            chunked.chunk(i) for i in range(chunked.num_chunks))
        for position, scalar in enumerate(scalars):
            assert scalar.as_py() == case[position]
2307
2308
@pytest.mark.large_memory
def test_list_child_overflow_to_chunked():
    """A list of single-string lists with too much string data raises a
    ValueError mentioning an overflow."""
    row_count = (2 << 20) + 1
    rows = [['x' * 1024]] * row_count
    with pytest.raises(ValueError, match="overflowed"):
        pa.array(rows)
2314
2315
def test_infer_type_masked():
    """``pa.infer_type`` ignores values hidden by the mask."""
    # ARROW-5208
    mixed = ['foo', 'bar', None, 2]

    # only the trailing integer is masked out
    inferred = pa.infer_type(mixed, mask=[False, False, False, True])
    assert inferred == pa.utf8()

    # all masked
    everything_masked = np.array([True, True, True, True])
    inferred = pa.infer_type(mixed, mask=everything_masked)
    assert inferred == pa.null()

    # length 0
    inferred = pa.infer_type([], mask=[])
    assert inferred == pa.null()
2329
2330
def test_array_masked():
    """With the float entry masked out, the resulting type is int64 for
    both a list and an object ndarray."""
    # ARROW-5208
    raw_values = [4, None, 4, 3.]

    from_list = pa.array(raw_values,
                         mask=np.array([False, True, False, True]))
    assert from_list.type == pa.int64()

    # ndarray dtype=object argument
    from_objects = pa.array(np.array(raw_values, dtype="O"),
                            mask=np.array([False, True, False, True]))
    assert from_objects.type == pa.int64()
2341
2342
def test_array_from_large_pyints():
    """Converting ``2 ** 63`` without an explicit type raises
    OverflowError."""
    # ARROW-5430
    too_big = [int(2 ** 63)]
    with pytest.raises(OverflowError):
        # too large for int64 so dtype must be explicitly provided
        pa.array(too_big)
2348
2349
def test_array_protocol():
    """``pa.array`` uses an object's ``__arrow_array__`` method, rejects
    extra keywords and non-Arrow results, and accepts a ChunkedArray."""

    class ArrayLike:
        # wraps raw data and converts it to an Arrow array on request
        def __init__(self, data):
            self.data = data

        def __arrow_array__(self, type=None):
            return pa.array(self.data, type=type)

    wrapped = ArrayLike(np.array([1, 2, 3], dtype='int64'))

    # without an explicit type
    assert pa.array(wrapped).equals(pa.array([1, 2, 3], type=pa.int64()))

    # with an explicit type
    for requested in (pa.int64(), pa.float64()):
        converted = pa.array(wrapped, type=requested)
        assert converted.equals(pa.array([1, 2, 3], type=requested))

    # raise error when passing size or mask keywords
    with pytest.raises(ValueError):
        pa.array(wrapped, mask=np.array([True, False, True]))
    with pytest.raises(ValueError):
        pa.array(wrapped, size=3)

    # ensure the return value is an Array
    class NumpyReturning:
        # returns a numpy array from the protocol method
        def __init__(self, data):
            self.data = data

        def __arrow_array__(self, type=None):
            return np.array(self.data)

    invalid = NumpyReturning(np.array([1, 2, 3], dtype='int64'))
    with pytest.raises(TypeError):
        pa.array(invalid)

    # ARROW-7066 - allow ChunkedArray output
    class ChunkedReturning:
        # returns a one-chunk ChunkedArray from the protocol method
        def __init__(self, data):
            self.data = data

        def __arrow_array__(self, type=None):
            return pa.chunked_array([self.data], type=type)

    chunk_source = ChunkedReturning(np.array([1, 2, 3], dtype='int64'))
    chunked_result = pa.array(chunk_source)
    chunked_expected = pa.chunked_array([[1, 2, 3]], type=pa.int64())
    assert chunked_result.equals(chunked_expected)
2400
2401
def test_concat_array():
    """``pa.concat_arrays`` joins two integer arrays end to end."""
    pieces = [pa.array([1, 2]), pa.array([3, 4])]
    joined = pa.concat_arrays(pieces)
    assert joined.equals(pa.array([1, 2, 3, 4]))
2406
2407
def test_concat_array_different_types():
    """Concatenating an integer array with a float array raises
    ArrowInvalid."""
    mismatched = [pa.array([1]), pa.array([2.])]
    with pytest.raises(pa.ArrowInvalid):
        pa.concat_arrays(mismatched)
2411
2412
@pytest.mark.pandas
def test_to_pandas_timezone():
    """``to_pandas`` keeps a timezone on the result for both an Array and a
    ChunkedArray of tz-aware timestamps."""
    # https://issues.apache.org/jira/browse/ARROW-6652
    tz_type = pa.timestamp('s', tz='Europe/Brussels')
    plain = pa.array([1, 2, 3], type=tz_type)
    chunked = pa.chunked_array([plain])

    for container in (plain, chunked):
        series = container.to_pandas()
        assert series.dt.tz is not None
2422