# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import io
import json

import numpy as np
import pytest

import pyarrow as pa
from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
from pyarrow.tests.parquet.common import (
    parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported)
from pyarrow.util import guid
from pyarrow.vendored.version import Version

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
                                              _write_table)
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm

    from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
                                              alltypes_sample)
except ImportError:
    pd = tm = None


pytestmark = pytest.mark.parquet


@pytest.mark.pandas
def test_pandas_parquet_custom_metadata(tempdir):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]


@pytest.mark.pandas
def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
    # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
    schema = pa.schema([
        pa.field('int', pa.int16()),
        pa.field('float', pa.float32()),
        pa.field('string', pa.string())
    ])
    df1 = pd.DataFrame({
        'int': np.arange(3, dtype=np.uint8),
        'float': np.arange(3, dtype=np.float32),
        'string': ['ABBA', 'EDDA', 'ACDC']
    })
    df2 = pd.DataFrame({
        'int': [4, 5],
        'float': [1.1, None],
        'string': [None, None]
    })
    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)

    # The schemas differ only in their pandas metadata
    assert not table1.schema.equals(table2.schema, check_metadata=True)
    assert table1.schema.equals(table2.schema)

    writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
    writer.write_table(table1)
    writer.write_table(table2)
    # Close the writer so the file footer gets written out
    writer.close()
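

# The next two tests check that the 'pandas' schema metadata written by
# Table.from_pandas is what drives index reconstruction on read: a column
# MultiIndex round-trips through it, and with preserve_index=False the
# metadata records no index columns at all.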
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2']
    )

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')

    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns still needs to be filled.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


# TODO(dataset) duplicate column selection actually gives duplicate columns now
@pytest.mark.pandas
@parametrize_legacy_dataset_not_supported
def test_pandas_column_selection(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename)
    table_read = _read_table(
        filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    table_read = _read_table(
        filename, columns=['uint8', 'uint8'],
        use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)
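

# The round-trip tests below go through in-memory Arrow buffers
# (BufferOutputStream / BufferReader) and plain Python file objects rather
# than filesystem paths.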
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version='2.6')
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
def test_pandas_can_write_nested_data(tempdir):
    data = {
        "agg_col": [
            {"page_type": 1},
            {"record_type": 1},
            {"non_consecutive_home": 0},
        ],
        "uid_first": "1001"
    }
    df = pd.DataFrame(data=data)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    # This succeeds under V2
    _write_table(arrow_table, imos)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
    filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = pa.Table.from_pandas(df)

    with filename.open('wb') as f:
        _write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(filename.read_bytes())

    table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        _write_table(arrow_table, filename, version='2.6',
                     use_dictionary=use_dictionary)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for write_statistics in [True, False]:
        _write_table(arrow_table, filename, version='2.6',
                     write_statistics=write_statistics)
        table_read = _read_table(filename,
                                 use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
        if (compression != 'NONE' and
                not pa.lib.Codec.is_available(compression)):
            continue
        _write_table(arrow_table, filename, version='2.6',
                     compression=compression)
        table_read = _read_table(
            filename, use_legacy_dataset=use_legacy_dataset)
        df_read = table_read.to_pandas()
        tm.assert_frame_equal(df, df_read)
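

# flavor='spark' asks the writer for Spark-compatible output (e.g. sanitized
# field names); the pandas metadata should survive those adjustments.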
@pytest.mark.pandas
def test_spark_flavor_preserves_pandas_metadata():
    df = _test_dataframe(size=100)
    df.index = np.arange(0, 10 * len(df), 10)
    df.index.name = 'foo'

    result = _roundtrip_pandas_dataframe(df, {'version': '2.6',
                                              'flavor': 'spark'})
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_index_column_name_duplicate(tempdir, use_legacy_dataset):
    data = {
        'close': {
            pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
            pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
        },
        'time': {
            pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
                '2017-06-30 01:31:00'
            ),
            pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
                '2017-06-30 01:32:00'
            ),
        }
    }
    path = str(tempdir / 'data.parquet')
    dfx = pd.DataFrame(data).set_index('time', drop=False)
    tdfx = pa.Table.from_pandas(dfx)
    _write_table(tdfx, path)
    arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result_df = arrow_table.to_pandas()
    tm.assert_frame_equal(result_df, dfx)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):
    num_rows = 3
    numbers = list(range(num_rows))
    index = pd.MultiIndex.from_arrays(
        [['foo', 'foo', 'bar'], numbers],
        names=['foobar', 'some_numbers'],
    )

    df = pd.DataFrame({'numbers': numbers}, index=index)
    table = pa.Table.from_pandas(df)

    filename = tempdir / 'dup_multi_index_levels.parquet'

    _write_table(table, filename)
    result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    assert table.equals(result_table)

    result_df = result_table.to_pandas()
    tm.assert_frame_equal(result_df, df)
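

# The tests below read fixture files written by pyarrow 0.7.1 to make sure
# index metadata from that era is still interpreted correctly.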
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_naming(datadir, use_legacy_dataset):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
                           index_col=None, header=0, engine='python')
    table = _read_table(
        datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_named(
    datadir, use_legacy_dataset
):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string), sep=r'\s{2,}',
        index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()

    table = _read_table(datadir / 'v0.7.1.all-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_index_multi_level_some_named(
    datadir, use_legacy_dataset
):
    expected_string = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
        header=0, engine='python'
    ).sort_index()
    expected.index = expected.index.set_names(['cut', None, 'clarity'])

    table = _read_table(datadir / 'v0.7.1.some-named-index.parquet',
                        use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_backwards_compatible_column_metadata_handling(
    datadir, use_legacy_dataset
):
    expected = pd.DataFrame(
        {'a': [1, 2, 3], 'b': [.1, .2, .3],
         'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
    expected.index = pd.MultiIndex.from_arrays(
        [['a', 'b', 'c'],
         pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
        names=['index', None])

    path = datadir / 'v0.7.1.column-metadata-handling.parquet'
    table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)

    table = _read_table(
        path, columns=['a'], use_legacy_dataset=use_legacy_dataset)
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
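

# Categorical columns are dictionary-encoded on the Arrow side; the tests
# below cover index, ordering and null handling across the Parquet round trip.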
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_index_survives_roundtrip(use_legacy_dataset):
    # ARROW-3652, addressed by ARROW-3246
    df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
    df['c1'] = df['c1'].astype('category')
    df = df.set_index(['c1'])

    table = pa.Table.from_pandas(df)
    bos = pa.BufferOutputStream()
    pq.write_table(table, bos)
    ref_df = pq.read_pandas(
        bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
    assert isinstance(ref_df.index, pd.CategoricalIndex)
    assert ref_df.index.equals(df.index)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_categorical_order_survives_roundtrip(use_legacy_dataset):
    # ARROW-6302
    df = pd.DataFrame({"a": pd.Categorical(
        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})

    table = pa.Table.from_pandas(df)
    bos = pa.BufferOutputStream()
    pq.write_table(table, bos)

    contents = bos.getvalue()
    result = pq.read_pandas(
        contents, use_legacy_dataset=use_legacy_dataset).to_pandas()

    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_na_type_row_groups(use_legacy_dataset):
    # ARROW-5085
    df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
    df_category = df.astype({"col": "category", "int": "category"})
    table = pa.Table.from_pandas(df)
    table_cat = pa.Table.from_pandas(df_category)
    buf = pa.BufferOutputStream()

    # it works
    pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
    result = pq.read_table(
        buf.getvalue(), use_legacy_dataset=use_legacy_dataset)

    # Result is non-categorical
    assert result[0].equals(table[0])
    assert result[1].equals(table[1])


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_categorical_roundtrip(use_legacy_dataset):
    # ARROW-5480, this was enabled by ARROW-3246

    # Have one of the categories unobserved and include a null (-1)
    codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
    categories = ['foo', 'bar', 'baz']
    df = pd.DataFrame({'x': pd.Categorical.from_codes(
        codes, categories=categories)})

    buf = pa.BufferOutputStream()
    pq.write_table(pa.table(df), buf)

    result = pq.read_table(
        buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas()
    assert result.x.dtype == 'category'
    assert (result.x.cat.categories == categories).all()
    tm.assert_frame_equal(result, df)
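

# ARROW-8251: write_to_dataset should preserve pandas extension dtypes and
# indexes recorded in the pandas metadata, just like write_table does.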
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_extensiondtypes(
    tempdir, use_legacy_dataset
):
    # ARROW-8251 - preserve pandas extension dtypes in roundtrip
    if Version(pd.__version__) < Version("1.0.0"):
        pytest.skip("__arrow_array__ added to pandas in 1.0.0")

    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
    df['col'] = df['col'].astype("Int64")
    table = pa.table(df)

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result[["col"]], df[["col"]])


@pytest.mark.pandas
@parametrize_legacy_dataset
def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset):
    # ARROW-8251 - preserve pandas index in roundtrip

    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
    df.index = pd.Index(['a', 'b', 'c'], name="idx")
    table = pa.table(df)
    df_cat = df[["col", "part"]].copy()
    df_cat["part"] = df_cat["part"].astype("category")

    pq.write_to_dataset(
        table, str(tempdir / "case1"), partition_cols=['part'],
        use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df_cat)

    pq.write_to_dataset(
        table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    )
    result = pq.read_table(
        str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)

    pq.write_table(table, str(tempdir / "data.parquet"))
    result = pq.read_table(
        str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(result, df)


@pytest.mark.pandas
@pytest.mark.parametrize('preserve_index', [True, False, None])
def test_dataset_read_pandas_common_metadata(tempdir, preserve_index):
    # ARROW-1103
    nfiles = 5
    size = 5

    dirpath = tempdir / guid()
    dirpath.mkdir()

    test_data = []
    frames = []
    paths = []
    for i in range(nfiles):
        df = _test_dataframe(size, seed=i)
        df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')

        path = dirpath / '{}.parquet'.format(i)

        table = pa.Table.from_pandas(df, preserve_index=preserve_index)

        # Obliterate metadata
        table = table.replace_schema_metadata(None)
        assert table.schema.metadata is None

        _write_table(table, path)
        test_data.append(table)
        frames.append(df)
        paths.append(path)

    # Write _metadata common file
    table_for_metadata = pa.Table.from_pandas(
        df, preserve_index=preserve_index
    )
    pq.write_metadata(table_for_metadata.schema, dirpath / '_metadata')

    dataset = pq.ParquetDataset(dirpath)
    columns = ['uint8', 'strings']
    result = dataset.read_pandas(columns=columns).to_pandas()
    expected = pd.concat([x[columns] for x in frames])
    expected.index.name = (
        df.index.name if preserve_index is not False else None)
    tm.assert_frame_equal(result, expected)


@pytest.mark.pandas
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas(
        'data.parquet',
        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
    )
    assert result.equals(pa.table(df))