# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import io
import json

import numpy as np
import pytest

import pyarrow as pa
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
from pyarrow.tests.parquet.common import (
    parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
                                              _write_table)
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
                                              alltypes_sample)
except ImportError:
    pd = tm = None


pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]
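

@pytest.mark.pandas
def test_pandas_parquet_user_metadata_roundtrip(tempdir):
    # Companion sketch, not part of the original suite: user-defined
    # key/value metadata placed on the Arrow schema should survive the
    # write/read cycle alongside the b'pandas' entry checked above.
    # The b'user_key' entry is purely illustrative.
    table = pa.table({'a': [1, 2, 3]})
    table = table.replace_schema_metadata({b'user_key': b'user_value'})
    path = tempdir / 'user_metadata.parquet'
    _write_table(table, path)
    assert pq.read_metadata(path).metadata[b'user_key'] == b'user_value'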


@pytest.mark.pandas
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
    # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
    schema = pa.schema([
        pa.field('int', pa.int16()),
        pa.field('float', pa.float32()),
        pa.field('string', pa.string())
    ])
    df1 = pd.DataFrame({
        'int': np.arange(3, dtype=np.uint8),
        'float': np.arange(3, dtype=np.float32),
        'string': ['ABBA', 'EDDA', 'ACDC']
    })
    df2 = pd.DataFrame({
        'int': [4, 5],
        'float': [1.1, None],
        'string': [None, None]
    })
    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    assert not table1.schema.equals(table2.schema, check_metadata=True)
    assert table1.schema.equals(table2.schema)

    # Use a context manager so the writer is closed and the file footer is
    # written before the file is used.
    with pq.ParquetWriter(tempdir / 'merged.parquet',
                          schema=schema) as writer:
        writer.write_table(table1)
        writer.write_table(table2)
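    # Hedged follow-up check (an addition to the original test): the merged
    # file should read back with the rows of both tables.
    merged = pq.read_table(tempdir / 'merged.parquet')
    assert merged.num_rows == 5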


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2']
    )

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')

    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, the 'columns' entry still needs
    # to be filled.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


# TODO(dataset) duplicate column selection actually gives duplicate columns now
@pytest.mark.pandas
@parametrize_legacy_dataset_not_supported
def test_pandas_column_selection(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename)
    table_read = _read_table(
        filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    table_read = _read_table(
        filename, columns=['uint8', 'uint8'],
        use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
def test_pandas_can_write_nested_data(tempdir):
    data = {
        "agg_col": [
            {"page_type": 1},
            {"record_type": 1},
            {"non_consecutive_home": 0},
        ],
        "uid_first": "1001"
    }
    df = pd.DataFrame(data=data)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    # This succeeds under V2
    _write_table(arrow_table, imos)
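    # Hedged read-back check (an addition to the original test): the nested
    # struct column should also deserialize to an identical Arrow table.
    assert _read_table(pa.BufferReader(imos.getvalue())).equals(arrow_table)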


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
    filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = pa.Table.from_pandas(df)

    with filename.open('wb') as f:
        _write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(filename.read_bytes())

    table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        _write_table(arrow_table, filename, version='2.6',
                     use_dictionary=use_dictionary)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for write_statistics in [True, False]:
        _write_table(arrow_table, filename, version='2.6',
                     write_statistics=write_statistics)
        table_read = _read_table(filename,
                                 use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
        if (compression != 'NONE' and
                not pa.lib.Codec.is_available(compression)):
            continue
        _write_table(arrow_table, filename, version='2.6',
                     compression=compression)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)
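    # Hedged sanity check (an addition to the original test): the last file
    # written above should still report the full row count through the
    # Parquet file metadata API.
    assert pq.ParquetFile(filename).metadata.num_rows == size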


@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
    df = _test_dataframe(size=100)
    df.index = np.arange(0, 10 * len(df), 10)
    df.index.name = 'foo'

    result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
                                              'flavor': 'spark'})
    tm.assert_frame_equal(result, df)
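

@pytest.mark.pandas
def test_spark_flavor_sanitizes_field_names(tempdir):
    # Companion sketch, not part of the original suite: flavor='spark' is
    # expected to replace characters Spark rejects in field names (such as
    # spaces) with underscores. Treat this behavior as an assumption.
    df = pd.DataFrame({'foo bar': [1, 2, 3]})
    path = tempdir / 'spark_flavor.parquet'
    _write_table(pa.Table.from_pandas(df), path, flavor='spark')
    assert 'foo_bar' in _read_table(path).schema.names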


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_index_column_name_duplicate(tempdir, use_legacy_dataset):
    data = {
        'close': {
            pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
            pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
        },
        'time': {
            pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
                '2017-06-30 01:31:00'
            ),
            pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
                '2017-06-30 01:32:00'
            ),
        }
    }
    path = str(tempdir / 'data.parquet')
    dfx = pd.DataFrame(data).set_index('time', drop=False)
    tdfx = pa.Table.from_pandas(dfx)
    _write_table(tdfx, path)
    arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result_df = arrow_table.to_pandas()
    tm.assert_frame_equal(result_df, dfx)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):
    num_rows = 3
    numbers = list(range(num_rows))
    index = pd.MultiIndex.from_arrays(
        [['foo', 'foo', 'bar'], numbers],
        names=['foobar', 'some_numbers'],
    )

    df = pd.DataFrame({'numbers': numbers}, index=index)
    table = pa.Table.from_pandas(df)

    filename = tempdir / 'dup_multi_index_levels.parquet'

    _write_table(table, filename)
    result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    assert table.equals(result_table)

    result_df = result_table.to_pandas()
    tm.assert_frame_equal(result_df, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_naming(datadir, use_legacy_dataset):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
                           index_col=None, header=0, engine='python')
    table = _read_table(
        datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_named(
    datadir, use_legacy_dataset
):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string), sep=r'\s{2,}',
        index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()

    table = _read_table(datadir / 'v0.7.1.all-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_some_named(
        datadir, use_legacy_dataset
):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()
    expected.index = expected.index.set_names(['cut', None, 'clarity'])

    table = _read_table(datadir / 'v0.7.1.some-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_column_metadata_handling(
    datadir, use_legacy_dataset
):
    expected = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [.1, .2, .3],
         'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
    expected.index = pd.MultiIndex.from_arrays(
        [['a', 'b', 'c'],
         pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
        names=['index', None])

    path = datadir / 'v0.7.1.column-metadata-handling.parquet'
    table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)

    table = _read_table(
        path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_index_survives_roundtrip(use_legacy_dataset):
    # ARROW-3652, addressed by ARROW-3246
    df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
    df['c1'] = df['c1'].astype('category')
    df = df.set_index(['c1'])

    table = pa.Table.from_pandas(df)
    bos = pa.BufferOutputStream()
    pq.write_table(table, bos)
    ref_df = pq.read_pandas(
        bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
    assert isinstance(ref_df.index, pd.CategoricalIndex)
    assert ref_df.index.equals(df.index)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_order_survives_roundtrip(use_legacy_dataset):
    # ARROW-6302
    df = pd.DataFrame({"a": pd.Categorical(
        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})

    table = pa.Table.from_pandas(df)
    bos = pa.BufferOutputStream()
    pq.write_table(table, bos)

    contents = bos.getvalue()
    result = pq.read_pandas(
        contents, use_legacy_dataset=use_legacy_dataset).to_pandas()

    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_na_type_row_groups(use_legacy_dataset):
    # ARROW-5085
    df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
    df_category = df.astype({"col": "category", "int": "category"})
    table = pa.Table.from_pandas(df)
    table_cat = pa.Table.from_pandas(df_category)
    buf = pa.BufferOutputStream()

    # Writing a fully-null categorical across multiple row groups should
    # not raise
    pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
    result = pq.read_table(
        buf.getvalue(), use_legacy_dataset=use_legacy_dataset)

    # Result is non-categorical
    assert result[0].equals(table[0])
    assert result[1].equals(table[1])


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_roundtrip(use_legacy_dataset):
    # ARROW-5480, this was enabled by ARROW-3246

    # Have one of the categories unobserved and include a null (-1)
    codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
    categories = ['foo', 'bar', 'baz']
    df = pd.DataFrame({'x': pd.Categorical.from_codes(
        codes, categories=categories)})

    buf = pa.BufferOutputStream()
    pq.write_table(pa.table(df), buf)

    result = pq.read_table(
        buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
    assert result.x.dtype == 'category'
    assert (result.x.cat.categories == categories).all()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
    # ARROW-8251 - preserve pandas index in roundtrip

    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df_cat)

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')

        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)
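    # Hedged extra check (an addition to the original test): the common
    # metadata file written above should itself parse as Parquet metadata.
    assert pq.read_metadata(dirpath / '_metadata').num_columns > 0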


@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas(
        'data.parquet',
        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
    )
    assert result.equals(pa.table(df))