1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements.  See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership.  The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License.  You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied.  See the License for the
15# specific language governing permissions and limitations
16# under the License.
17
18import os
19import pathlib
20import subprocess
21from tempfile import TemporaryDirectory
22
23import pytest
24import hypothesis as h
25
26from pyarrow.util import find_free_port
27from pyarrow import Codec
28
29
# Hypothesis profiles: CI explores many more generated examples than the
# cheaper 'dev' default; 'debug' keeps runs small but verbose for triage.
h.settings.register_profile('ci', max_examples=1000)
h.settings.register_profile('dev', max_examples=50)
h.settings.register_profile('debug', max_examples=10,
                            verbosity=h.Verbosity.verbose)

# Load the default hypothesis profile: either set the HYPOTHESIS_PROFILE
# environment variable or pass --hypothesis-profile to pytest.  To see the
# generated examples try:
# pytest pyarrow -sv --enable-hypothesis --hypothesis-profile=debug
h.settings.load_profile(os.environ.get('HYPOTHESIS_PROFILE', 'dev'))

# Set this before the AWS SDK is first loaded so the tests never read a
# developer's real AWS user configuration values.
os.environ['AWS_CONFIG_FILE'] = "/dev/null"
45
46
# Names of every selectively-enabled test group; each name doubles as a
# pytest marker and as the suffix of the --enable-*/--disable-* options.
groups = ('brotli bz2 cython dataset hypothesis fastparquet gandiva gzip '
          'hdfs large_memory lz4 memory_leak nopandas orc pandas parquet '
          'plasma s3 snappy tensorflow flight slow requires_testing_data '
          'zstd').split()
73
# Default enablement per test group.  Codec groups reflect whether the codec
# was compiled into this pyarrow build; optional-dependency groups start off
# disabled and are switched on by the feature-detection imports further down.
defaults = dict(
    brotli=Codec.is_available('brotli'),
    bz2=Codec.is_available('bz2'),
    cython=False,
    dataset=False,
    fastparquet=False,
    hypothesis=False,
    gandiva=False,
    gzip=Codec.is_available('gzip'),
    hdfs=False,
    large_memory=False,
    lz4=Codec.is_available('lz4'),
    memory_leak=False,
    orc=False,
    nopandas=False,
    pandas=False,
    parquet=False,
    plasma=False,
    s3=False,
    snappy=Codec.is_available('snappy'),
    tensorflow=False,
    flight=False,
    slow=False,
    requires_testing_data=True,
    zstd=Codec.is_available('zstd'),
)
100
# Feature detection: flip a group's default to True when its optional
# dependency imports cleanly.  Only ImportError is caught on purpose, so
# genuine breakage inside an optional dependency still surfaces loudly.
try:
    import cython  # noqa
    defaults['cython'] = True
except ImportError:
    pass

try:
    import fastparquet  # noqa
    defaults['fastparquet'] = True
except ImportError:
    pass

try:
    import pyarrow.gandiva  # noqa
    defaults['gandiva'] = True
except ImportError:
    pass

try:
    import pyarrow.dataset  # noqa
    defaults['dataset'] = True
except ImportError:
    pass

try:
    import pyarrow.orc  # noqa
    defaults['orc'] = True
except ImportError:
    pass

# pandas is special-cased: the 'nopandas' group runs exactly when pandas is
# absent, so the except branch enables that group instead of passing.
try:
    import pandas  # noqa
    defaults['pandas'] = True
except ImportError:
    defaults['nopandas'] = True

try:
    import pyarrow.parquet  # noqa
    defaults['parquet'] = True
except ImportError:
    pass

try:
    import pyarrow.plasma  # noqa
    defaults['plasma'] = True
except ImportError:
    pass

try:
    import tensorflow  # noqa
    defaults['tensorflow'] = True
except ImportError:
    pass

try:
    import pyarrow.flight  # noqa
    defaults['flight'] = True
except ImportError:
    pass

# The S3 and HDFS filesystems are compile-time options of the pyarrow build,
# detected here through the availability of their pyarrow.fs classes.
try:
    from pyarrow.fs import S3FileSystem  # noqa
    defaults['s3'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import HadoopFileSystem  # noqa
    defaults['hdfs'] = True
except ImportError:
    pass
172
173
def pytest_addoption(parser):
    """Register --enable-<group>/--disable-<group> flags for every group.

    The enable flag's default comes from the PYARROW_TEST_<GROUP>
    environment variable when set, otherwise from the `defaults` table.
    """
    def bool_env(name, default=None):
        # Interpret an environment variable as a boolean, falling back to
        # `default` when the variable is unset.
        raw = os.environ.get(name.upper())
        if raw is None:
            return default
        raw = raw.lower()
        if raw in {'1', 'true', 'on', 'yes', 'y'}:
            return True
        if raw in {'0', 'false', 'off', 'no', 'n'}:
            return False
        raise ValueError('{}={} is not parsable as boolean'
                         .format(name.upper(), raw))

    for group in groups:
        enabled_by_default = bool_env('PYARROW_TEST_{}'.format(group),
                                      defaults[group])
        parser.addoption('--enable-{}'.format(group),
                         action='store_true', default=enabled_by_default,
                         help=('Enable the {} test group'.format(group)))
        parser.addoption('--disable-{}'.format(group),
                         action='store_true', default=False,
                         help=('Disable the {} test group'.format(group)))
197
198
class PyArrowConfig:
    """Per-session record of which test groups are enabled."""

    def __init__(self):
        # Maps group name -> bool, populated by pytest_configure.
        self.is_enabled = {}

    def apply_mark(self, mark):
        """Honor a pytest marker: skip the test if its group is disabled."""
        if mark.name in groups:
            self.requires(mark.name)

    def requires(self, group):
        """Skip the current test unless `group` is enabled."""
        if not self.is_enabled[group]:
            pytest.skip('{} NOT enabled'.format(group))
211
212
def pytest_configure(config):
    """Turn the command-line group flags into PyArrowConfig state."""
    config.pyarrow = PyArrowConfig()

    for group in groups:
        # Register the marker so strict marker checking accepts it.
        config.addinivalue_line(
            "markers", group,
        )

        enabled = config.getoption('--enable-{}'.format(group))
        disabled = config.getoption('--disable-{}'.format(group))
        # An explicit --disable-<group> always wins over --enable-<group>.
        config.pyarrow.is_enabled[group] = enabled and not disabled
228
229
def pytest_runtest_setup(item):
    """Skip the test when any of its group markers is disabled."""
    pyarrow_config = item.config.pyarrow
    for marker in item.iter_markers():
        pyarrow_config.apply_mark(marker)
234
235
@pytest.fixture
def tempdir(tmpdir):
    """Per-test temporary directory as a pathlib.Path."""
    # pytest's tmpdir fixture yields a py.path.local; convert it.
    path_str = tmpdir.strpath
    return pathlib.Path(path_str)
240
241
@pytest.fixture(scope='session')
def base_datadir():
    """Path of the checked-in test data directory next to this file."""
    here = pathlib.Path(__file__).parent
    return here / 'data'
245
246
@pytest.fixture(autouse=True)
def disable_aws_metadata(monkeypatch):
    """Keep the AWS SDK away from the EC2 instance metadata server.

    Without this, every test that exercises the S3 filesystem stalls for
    about 5 seconds while the SDK probes the metadata endpoint.
    """
    # The SDK honors this variable and skips the metadata lookup entirely.
    monkeypatch.setenv("AWS_EC2_METADATA_DISABLED", "true")
255
256
257# TODO(kszucs): move the following fixtures to test_fs.py once the previous
258# parquet dataset implementation and hdfs implementation are removed.
259
@pytest.fixture(scope='session')
def hdfs_connection():
    """(host, port, user) for the HDFS test cluster, env-overridable."""
    env = os.environ.get
    host = env('ARROW_HDFS_TEST_HOST', 'default')
    port = int(env('ARROW_HDFS_TEST_PORT', 0))
    user = env('ARROW_HDFS_TEST_USER', 'hdfs')
    return host, port, user
266
267
@pytest.fixture(scope='session')
def s3_connection():
    """(host, port, access_key, secret_key) for the local minio server."""
    return 'localhost', find_free_port(), 'arrow', 'apachearrow'
273
274
@pytest.fixture(scope='session')
def s3_server(s3_connection):
    """Run a local `minio` server for the S3 filesystem tests.

    Yields a dict with 'connection', 'process' and 'tempdir' keys, and
    skips the requesting tests when the `minio` binary is not installed.
    The server is killed when the session ends.
    """
    host, port, access_key, secret_key = s3_connection

    address = '{}:{}'.format(host, port)
    env = os.environ.copy()
    env.update({
        'MINIO_ACCESS_KEY': access_key,
        'MINIO_SECRET_KEY': secret_key
    })

    with TemporaryDirectory() as tempdir:
        args = ['minio', '--compat', 'server', '--quiet', '--address',
                address, tempdir]
        proc = None
        try:
            proc = subprocess.Popen(args, env=env)
        except OSError:
            pytest.skip('`minio` command cannot be located')
        else:
            yield {
                'connection': s3_connection,
                'process': proc,
                'tempdir': tempdir
            }
        finally:
            if proc is not None:
                proc.kill()
                # Reap the child: without wait() the killed minio lingers
                # as a zombie and TemporaryDirectory cleanup can race the
                # process while it still holds files open in tempdir.
                proc.wait()
303