# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest
import decimal
import datetime

import pyarrow as pa

# Marks all of the tests in this module
# Ignore these with pytest ... -m 'not orc'
pytestmark = pytest.mark.orc


try:
    from pandas.testing import assert_frame_equal
    import pandas as pd
except ImportError:
    # Deliberate best-effort import: the helpers below that use pandas are
    # only reached from the test marked with ``pytest.mark.pandas``.
    pass


@pytest.fixture(scope='module')
def datadir(datadir):
    """
    Return the 'orc' subdirectory of the incoming ``datadir`` fixture.

    NOTE(review): the incoming ``datadir`` is presumably provided by a
    conftest and is path-like (it must support the ``/`` operator) --
    confirm against the conftest.
    """
    return datadir / 'orc'


def fix_example_values(actual_cols, expected_cols):
    """
    Fix type of expected values (as read from JSON) according to
    actual ORC datatype.

    The Python type of the first value of each actual column decides
    which conversion, if any, is applied to the expected column of the
    same name.  ``expected_cols`` is modified in place; nothing is
    returned.

    Parameters
    ----------
    actual_cols : mapping of column name -> column values read from ORC
    expected_cols : mapping of column name -> column values read from JSON
        In this module both arguments are pandas DataFrames.
    """
    for name in expected_cols:
        expected = expected_cols[name]
        actual = actual_cols[name]
        if len(actual) == 0:
            # No value to infer the type from; indexing the first element
            # would raise, and there is nothing to convert anyway.
            continue
        typ = type(actual[0])
        if issubclass(typ, datetime.datetime):
            # timestamp fields are represented as strings in JSON files
            expected = pd.to_datetime(expected)
        elif issubclass(typ, datetime.date):
            # date fields are represented as strings in JSON files
            # NOTE(review): ``.dt`` needs a datetime-like column, so the
            # JSON reader has presumably parsed it already -- confirm.
            expected = expected.dt.date
        elif typ is decimal.Decimal:
            converted_decimals = [None] * len(expected)
            # decimal fields are represented as reals in JSON files
            for i, (d, v) in enumerate(zip(actual, expected)):
                if not pd.isnull(v):
                    # Rebuild the expected value with the same exponent
                    # (scale) as the actual decimal: scale up to an
                    # integer, round, then shift the exponent back.
                    exp = d.as_tuple().exponent
                    factor = 10 ** -exp
                    converted_decimals[i] = (
                        decimal.Decimal(round(v * factor)).scaleb(exp))
            expected = pd.Series(converted_decimals)

        expected_cols[name] = expected


def check_example_values(orc_df, expected_df, start=None, stop=None):
    """
    Assert that ``orc_df`` equals ``expected_df`` (dtypes are not checked).

    If ``start`` or ``stop`` is given, only the rows ``start:stop`` of
    ``expected_df`` are compared, with their index reset so that it lines
    up with ``orc_df``.
    """
    if start is not None or stop is not None:
        expected_df = expected_df[start:stop].reset_index(drop=True)
    assert_frame_equal(orc_df, expected_df, check_dtype=False)


def check_example_file(orc_path, expected_df, need_fix=False):
    """
    Check an ORC file against the expected DataFrame.

    The file is checked twice: once read as a whole table, and once
    stripe by stripe against the matching row range of ``expected_df``.

    Parameters
    ----------
    orc_path : path of the ORC file to read
    expected_df : pandas DataFrame holding the expected values
    need_fix : bool, default False
        If True, the expected values are first converted with
        ``fix_example_values`` to match the types read from ORC.
    """
    from pyarrow import orc

    orc_file = orc.ORCFile(orc_path)
    # Exercise ORCFile.read()
    table = orc_file.read()
    assert isinstance(table, pa.Table)
    table.validate()

    # This workaround needed because of ARROW-3080
    orc_df = pd.DataFrame(table.to_pydict())

    assert set(expected_df.columns) == set(orc_df.columns)

    # reorder columns if necessary
    if not orc_df.columns.equals(expected_df.columns):
        expected_df = expected_df.reindex(columns=orc_df.columns)

    if need_fix:
        fix_example_values(orc_df, expected_df)

    check_example_values(orc_df, expected_df)
    # Exercise ORCFile.read_stripe()
    json_pos = 0
    for i in range(orc_file.nstripes):
        batch = orc_file.read_stripe(i)
        check_example_values(pd.DataFrame(batch.to_pydict()),
                             expected_df,
                             start=json_pos,
                             stop=json_pos + len(batch))
        json_pos += len(batch)
    # All stripes together must account for every row of the file
    assert json_pos == orc_file.nrows


@pytest.mark.pandas
@pytest.mark.parametrize('filename', [
    'TestOrcFile.test1.orc',
    'TestOrcFile.testDate1900.orc',
    'decimal.orc'
])
def test_example_using_json(filename, datadir):
    """
    Check an ORC file example against the equivalent JSON file, as given
    in the Apache ORC repository (the JSON file has one JSON object per
    line, corresponding to one row in the ORC file).
    """
    # Read JSON file
    path = datadir / filename
    table = pd.read_json(str(path.with_suffix('.jsn.gz')), lines=True)
    check_example_file(path, table, need_fix=True)


def test_orcfile_empty(datadir):
    """
    Check that the empty example file reads as a zero-row table that
    still has the expected schema.
    """
    from pyarrow import orc

    table = orc.ORCFile(datadir / 'TestOrcFile.emptyFile.orc').read()
    assert table.num_rows == 0

    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert table.schema == expected_schema