1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18import pytest
19import decimal
20import datetime
21
22import pyarrow as pa
23
# Marks all of the tests in this module with the ``orc`` marker.
# Deselect them with: pytest ... -m 'not orc'
pytestmark = pytest.mark.orc


# pandas is an optional dependency here: a failed import is deliberately
# ignored so that this module can still be imported without it.  When the
# import fails, ``pd`` and ``assert_frame_equal`` are left undefined, so
# only code that never touches them can run (the pandas-based test in this
# module is tagged with ``pytest.mark.pandas``).
try:
    from pandas.testing import assert_frame_equal
    import pandas as pd
except ImportError:
    pass
34
35
@pytest.fixture(scope='module')
def datadir(datadir):
    """
    Module-scoped fixture returning the ``orc`` subdirectory of the
    ``datadir`` value it receives (presumably a same-named fixture
    defined at a higher level -- confirm in conftest).
    """
    orc_dir = datadir / 'orc'
    return orc_dir
39
40
def fix_example_values(actual_cols, expected_cols):
    """
    Fix type of expected values (as read from JSON) according to
    actual ORC datatype.

    Parameters
    ----------
    actual_cols : pandas.DataFrame (or mapping of name -> sequence)
        Columns as read from the ORC file.  Only inspected, never modified.
    expected_cols : pandas.DataFrame
        Columns as read from the JSON file.  Modified in place: timestamp,
        date and decimal columns are replaced by converted columns; all
        other columns are left untouched.

    Returns
    -------
    None
    """
    for name in expected_cols:
        expected = expected_cols[name]
        actual = actual_cols[name]
        # Infer the column type from the first non-null value, so that an
        # empty column does not raise and a leading null does not cause
        # the conversion to be skipped.  (Plain ``is not None`` is used
        # because values may be lists/dicts, which pd.isnull() would treat
        # element-wise.)
        sample = next((value for value in actual if value is not None), None)

        # NOTE: datetime.datetime is a subclass of datetime.date, so the
        # timestamp check must come first.
        if isinstance(sample, datetime.datetime):
            # timestamp fields are represented as strings in JSON files
            expected = pd.to_datetime(expected)
        elif isinstance(sample, datetime.date):
            # date fields are represented as strings in JSON files; parse
            # them explicitly instead of relying on the JSON reader having
            # already converted the column (to_datetime is a no-op on an
            # already-converted column)
            expected = pd.to_datetime(expected).dt.date
        elif isinstance(sample, decimal.Decimal):
            converted_decimals = [None] * len(expected)
            # decimal fields are represented as reals in JSON files
            for i, (d, v) in enumerate(zip(actual, expected)):
                if pd.isnull(v):
                    continue
                if d is None:
                    # No actual value to take the scale from: keep the raw
                    # expected value so the later comparison reports the
                    # mismatch instead of crashing here.
                    converted_decimals[i] = v
                    continue
                # Rebuild the expected value with the same scale (number
                # of decimal places) as the actual ORC value.
                exp = d.as_tuple().exponent
                factor = 10 ** -exp
                converted_decimals[i] = (
                    decimal.Decimal(int(round(v * factor))).scaleb(exp))
            # Keep the original row labels so the assignment below aligns
            # row for row even if the index is not the default one.
            expected = pd.Series(converted_decimals, index=expected.index)

        expected_cols[name] = expected
68
69
def check_example_values(orc_df, expected_df, start=None, stop=None):
    """
    Assert that *orc_df* equals *expected_df*, or equals the
    ``[start:stop]`` row slice of it when either bound is given.

    The slice is re-indexed from zero before the comparison, and column
    dtypes are not compared (``check_dtype=False``).
    """
    unbounded = start is None and stop is None
    if not unbounded:
        window = expected_df[start:stop]
        expected_df = window.reset_index(drop=True)
    assert_frame_equal(orc_df, expected_df, check_dtype=False)
74
75
def check_example_file(orc_path, expected_df, need_fix=False):
    """
    Check an ORC file against the expected pandas DataFrame.

    The file is compared twice: once as a whole through ORCFile.read(),
    then stripe by stripe through ORCFile.read_stripe(), each stripe
    being matched against the corresponding row range of *expected_df*.
    When *need_fix* is true, the expected values are first converted with
    fix_example_values().
    """
    from pyarrow import orc

    reader = orc.ORCFile(orc_path)

    # Exercise ORCFile.read()
    whole_table = reader.read()
    assert isinstance(whole_table, pa.Table)
    whole_table.validate()

    # This workaround needed because of ARROW-3080
    actual_df = pd.DataFrame(whole_table.to_pydict())

    assert set(expected_df.columns) == set(actual_df.columns)

    # reorder columns if necessary
    same_order = actual_df.columns.equals(expected_df.columns)
    if not same_order:
        expected_df = expected_df.reindex(columns=actual_df.columns)

    if need_fix:
        fix_example_values(actual_df, expected_df)

    check_example_values(actual_df, expected_df)

    # Exercise ORCFile.read_stripe()
    rows_seen = 0
    for stripe_index in range(reader.nstripes):
        stripe = reader.read_stripe(stripe_index)
        stripe_df = pd.DataFrame(stripe.to_pydict())
        stripe_end = rows_seen + len(stripe)
        check_example_values(stripe_df, expected_df,
                             start=rows_seen, stop=stripe_end)
        rows_seen = stripe_end
    # the stripes together must account for every row of the file
    assert rows_seen == reader.nrows
111
112
@pytest.mark.pandas
@pytest.mark.parametrize('filename', [
    'TestOrcFile.test1.orc',
    'TestOrcFile.testDate1900.orc',
    'decimal.orc'
])
def test_example_using_json(filename, datadir):
    """
    Compare an example ORC file with its JSON counterpart from the Apache
    ORC repository.  The JSON file sits next to the ORC file with the
    suffix ``.jsn.gz`` and holds one JSON object per line, each line
    corresponding to one row of the ORC file.
    """
    orc_path = datadir / filename
    # Read JSON file
    json_path = orc_path.with_suffix('.jsn.gz')
    expected_df = pd.read_json(str(json_path), lines=True)
    check_example_file(orc_path, expected_df, need_fix=True)
129
130
def test_orcfile_empty(datadir):
    """
    Reading the empty example file must give a table with zero rows that
    still carries the complete expected schema.
    """
    from pyarrow import orc

    empty_path = datadir / 'TestOrcFile.emptyFile.orc'
    table = orc.ORCFile(empty_path).read()
    assert table.num_rows == 0

    # The (int1, string1) struct recurs as the element / value type of
    # several nested columns, so spell it out once.
    inner_struct = pa.struct([
        ('int1', pa.int32()),
        ('string1', pa.string()),
    ])
    inner_list = pa.list_(inner_struct)
    map_entry = pa.struct([
        ('key', pa.string()),
        ('value', inner_struct),
    ])

    expected_fields = [
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([('list', inner_list)])),
        ('list', inner_list),
        ('map', pa.list_(map_entry)),
    ]
    assert table.schema == pa.schema(expected_fields)
166