# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import os
import pathlib
import subprocess
from tempfile import TemporaryDirectory

import pytest
import hypothesis as h

from pyarrow.util import find_free_port
from pyarrow import Codec


# Register hypothesis profiles: 'ci' runs many examples, 'dev' a moderate
# number, and 'debug' a handful with verbose output for interactive work.
h.settings.register_profile('ci', max_examples=1000)
h.settings.register_profile('dev', max_examples=50)
h.settings.register_profile('debug', max_examples=10,
                            verbosity=h.Verbosity.verbose)

# Load the default hypothesis profile. Either set the HYPOTHESIS_PROFILE
# environment variable or pass the --hypothesis-profile option to pytest;
# to see the generated examples try:
#   pytest pyarrow -sv --enable-hypothesis --hypothesis-profile=debug
h.settings.load_profile(os.environ.get('HYPOTHESIS_PROFILE', 'dev'))

# Set this at the beginning, before the AWS SDK is loaded, to avoid reading
# in user configuration values.
os.environ['AWS_CONFIG_FILE'] = "/dev/null"


# Names of all selectively-enabled test groups; each gets a pytest marker
# plus --enable-<group>/--disable-<group> command-line options.
groups = [
    'brotli',
    'bz2',
    'cython',
    'dataset',
    'hypothesis',
    'fastparquet',
    'gandiva',
    'gzip',
    'hdfs',
    'large_memory',
    'lz4',
    'memory_leak',
    'nopandas',
    'orc',
    'pandas',
    'parquet',
    'plasma',
    's3',
    'snappy',
    'tensorflow',
    'flight',
    'slow',
    'requires_testing_data',
    'zstd',
]

# Default enablement for each group. Compression groups default to whether
# the codec was compiled into this pyarrow build; optional-component groups
# default to False and are flipped on below if the import probe succeeds.
defaults = {
    'brotli': Codec.is_available('brotli'),
    'bz2': Codec.is_available('bz2'),
    'cython': False,
    'dataset': False,
    'fastparquet': False,
    'hypothesis': False,
    'gandiva': False,
    'gzip': Codec.is_available('gzip'),
    'hdfs': False,
    'large_memory': False,
    'lz4': Codec.is_available('lz4'),
    'memory_leak': False,
    'orc': False,
    'nopandas': False,
    'pandas': False,
    'parquet': False,
    'plasma': False,
    's3': False,
    'snappy': Codec.is_available('snappy'),
    'tensorflow': False,
    'flight': False,
    'slow': False,
    'requires_testing_data': True,
    'zstd': Codec.is_available('zstd'),
}

# Probe for optional components; each successful import enables its group.
try:
    import cython  # noqa
    defaults['cython'] = True
except ImportError:
    pass

try:
    import fastparquet  # noqa
    defaults['fastparquet'] = True
except ImportError:
    pass

try:
    import pyarrow.gandiva  # noqa
    defaults['gandiva'] = True
except ImportError:
    pass

try:
    import pyarrow.dataset  # noqa
    defaults['dataset'] = True
except ImportError:
    pass

try:
    import pyarrow.orc  # noqa
    defaults['orc'] = True
except ImportError:
    pass

try:
    import pandas  # noqa
    defaults['pandas'] = True
except ImportError:
    # 'nopandas' marks tests that should only run when pandas is absent.
    defaults['nopandas'] = True

try:
    import pyarrow.parquet  # noqa
    defaults['parquet'] = True
except ImportError:
    pass

try:
    import pyarrow.plasma  # noqa
    defaults['plasma'] = True
except ImportError:
    pass

try:
    import tensorflow  # noqa
    defaults['tensorflow'] = True
except ImportError:
    pass

try:
    import pyarrow.flight  # noqa
    defaults['flight'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import S3FileSystem  # noqa
    defaults['s3'] = True
except ImportError:
    pass

try:
    from pyarrow.fs import HadoopFileSystem  # noqa
    defaults['hdfs'] = True
except ImportError:
    pass


def pytest_addoption(parser):
    """Register --enable-<group>/--disable-<group> options for every group.

    The enable default for each group may be overridden through a
    PYARROW_TEST_<GROUP> environment variable parsed as a boolean.
    """
    def bool_env(name, default=None):
        # Parse an environment variable as a boolean; `default` is returned
        # when the variable is unset, and an unrecognized value raises.
        value = os.environ.get(name.upper())
        if value is None:
            return default
        value = value.lower()
        if value in {'1', 'true', 'on', 'yes', 'y'}:
            return True
        elif value in {'0', 'false', 'off', 'no', 'n'}:
            return False
        else:
            raise ValueError('{}={} is not parsable as boolean'
                             .format(name.upper(), value))

    for group in groups:
        default = bool_env('PYARROW_TEST_{}'.format(group), defaults[group])
        parser.addoption('--enable-{}'.format(group),
                         action='store_true', default=default,
                         help=('Enable the {} test group'.format(group)))
        parser.addoption('--disable-{}'.format(group),
                         action='store_true', default=False,
                         help=('Disable the {} test group'.format(group)))


class PyArrowConfig:
    """Holds the resolved enabled/disabled state of each test group."""

    def __init__(self):
        # Maps group name -> bool, populated by pytest_configure.
        self.is_enabled = {}

    def apply_mark(self, mark):
        """Skip the current test if its marker names a disabled group."""
        group = mark.name
        if group in groups:
            self.requires(group)

    def requires(self, group):
        """Skip the current test unless `group` is enabled."""
        if not self.is_enabled[group]:
            pytest.skip('{} NOT enabled'.format(group))


def pytest_configure(config):
    """Resolve command-line options into a PyArrowConfig on `config`.

    A --disable-<group> flag always wins over --enable-<group> (and over
    the environment-derived default).
    """
    config.pyarrow = PyArrowConfig()

    for mark in groups:
        # Register the marker so `pytest --strict-markers` accepts it.
        config.addinivalue_line(
            "markers", mark,
        )

        enable_flag = '--enable-{}'.format(mark)
        disable_flag = '--disable-{}'.format(mark)

        is_enabled = (config.getoption(enable_flag) and not
                      config.getoption(disable_flag))
        config.pyarrow.is_enabled[mark] = is_enabled


def pytest_runtest_setup(item):
    # Apply test markers to skip tests selectively
    for mark in item.iter_markers():
        item.config.pyarrow.apply_mark(mark)


@pytest.fixture
def tempdir(tmpdir):
    # convert pytest's LocalPath to pathlib.Path
    return pathlib.Path(tmpdir.strpath)


@pytest.fixture(scope='session')
def base_datadir():
    # Directory holding static test data, next to this conftest.
    return pathlib.Path(__file__).parent / 'data'


@pytest.fixture(autouse=True)
def disable_aws_metadata(monkeypatch):
    """Stop the AWS SDK from trying to contact the EC2 metadata server.

    Otherwise, this causes a 5 second delay in tests that exercise the
    S3 filesystem.
    """
    monkeypatch.setenv("AWS_EC2_METADATA_DISABLED", "true")


# TODO(kszucs): move the following fixtures to test_fs.py once the previous
# parquet dataset implementation and hdfs implementation are removed.

@pytest.fixture(scope='session')
def hdfs_connection():
    """Return (host, port, user) for the HDFS test cluster, from env vars."""
    host = os.environ.get('ARROW_HDFS_TEST_HOST', 'default')
    port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 0))
    user = os.environ.get('ARROW_HDFS_TEST_USER', 'hdfs')
    return host, port, user


@pytest.fixture(scope='session')
def s3_connection():
    """Return (host, port, access_key, secret_key) for a local S3 server."""
    host, port = 'localhost', find_free_port()
    access_key, secret_key = 'arrow', 'apachearrow'
    return host, port, access_key, secret_key


@pytest.fixture(scope='session')
def s3_server(s3_connection):
    """Launch a minio server for the session; skip if minio is missing.

    Yields a dict with the connection tuple, the server process, and the
    backing temporary directory.
    """
    host, port, access_key, secret_key = s3_connection

    address = '{}:{}'.format(host, port)
    env = os.environ.copy()
    env.update({
        'MINIO_ACCESS_KEY': access_key,
        'MINIO_SECRET_KEY': secret_key
    })

    with TemporaryDirectory() as tempdir:
        args = ['minio', '--compat', 'server', '--quiet', '--address',
                address, tempdir]
        proc = None
        try:
            proc = subprocess.Popen(args, env=env)
        except OSError:
            pytest.skip('`minio` command cannot be located')
        else:
            yield {
                'connection': s3_connection,
                'process': proc,
                'tempdir': tempdir
            }
        finally:
            if proc is not None:
                proc.kill()
                # Reap the process to avoid leaving a zombie for the rest
                # of the session and to ensure it has released the tempdir
                # before TemporaryDirectory cleans it up.
                proc.wait()