#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from functools import total_ordering
import itertools
import re

# Every Module instance registers itself here, in construction order.
all_modules = []


@total_ordering
class Module(object):
    """
    A module is the basic abstraction in our test runner script. Each module consists of a set of
    source files, a set of test commands, and a set of dependencies on other modules. We use modules
    to define a dependency graph that lets us determine which tests to run based on which files
    have changed.

    Modules are compared, ordered and hashed by their ``name`` only.
    """

    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
                 environ=None, sbt_test_goals=(), python_test_goals=(),
                 blacklisted_python_implementations=(), test_tags=(), should_run_r_tests=False,
                 should_run_build_tests=False):
        """
        Define a new module.

        Constructing a module has two side effects: the new module is added to the
        ``dependent_modules`` set of each of its dependencies, and it is appended to the
        module-level ``all_modules`` list.

        :param name: A short module name, for display in logging and error messages.
        :param dependencies: A set of dependencies for this module. This should only include direct
            dependencies; transitive dependencies are resolved automatically.
        :param source_file_regexes: a set of regexes that match source files belonging to this
            module. These regexes are applied by attempting to match at the beginning of the
            filename strings.
        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
            order to build and test this module (e.g. '-PprofileName').
        :param environ: A dict of environment variables that should be set when files in this
            module are changed. If omitted (None), the module gets its own empty dict.
        :param sbt_test_goals: A set of SBT test goals for testing this module.
        :param python_test_goals: A set of Python test goals for testing this module.
        :param blacklisted_python_implementations: A set of Python implementations that are not
            supported by this module's Python components. The values in this set should match
            strings returned by Python's `platform.python_implementation()`.
        :param test_tags: A set of tags that will be excluded when running unit tests if the module
            is not explicitly changed.
        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
        :param should_run_build_tests: If true, changes in this module will trigger build tests.
        """
        self.name = name
        self.dependencies = dependencies
        # Despite the attribute name, these are regexes applied with re.match().
        self.source_file_prefixes = source_file_regexes
        self.sbt_test_goals = sbt_test_goals
        self.build_profile_flags = build_profile_flags
        # A fresh dict per instance: a `{}` default in the signature would be shared by every
        # module that does not pass `environ`.
        self.environ = {} if environ is None else environ
        self.python_test_goals = python_test_goals
        self.blacklisted_python_implementations = blacklisted_python_implementations
        self.test_tags = test_tags
        self.should_run_r_tests = should_run_r_tests
        self.should_run_build_tests = should_run_build_tests

        # Reverse edges of the dependency graph: modules that directly depend on this one.
        self.dependent_modules = set()
        for dep in dependencies:
            dep.dependent_modules.add(self)
        all_modules.append(self)

    def contains_file(self, filename):
        """Return True if any of this module's regexes matches at the start of `filename`."""
        return any(re.match(p, filename) for p in self.source_file_prefixes)

    def __repr__(self):
        return "Module<%s>" % self.name

    def __lt__(self, other):
        # Returning NotImplemented (instead of failing on `other.name`) lets Python fall back to
        # its standard handling of comparisons between unrelated types.
        if not isinstance(other, Module):
            return NotImplemented
        return self.name < other.name

    def __eq__(self, other):
        if not isinstance(other, Module):
            return NotImplemented
        return self.name == other.name

    def __ne__(self, other):
        # Defined explicitly because Python 2 does not derive __ne__ from __eq__.
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        return hash(self.name)


tags = Module(
    name="tags",
    dependencies=[],
    source_file_regexes=[
        "common/tags/",
    ]
)


catalyst = Module(
    name="catalyst",
    dependencies=[tags],
    source_file_regexes=[
        "sql/catalyst/",
    ],
    sbt_test_goals=[
        "catalyst/test",
    ],
)


sql = Module(
    name="sql",
    dependencies=[catalyst],
    source_file_regexes=[
        "sql/core/",
    ],
    sbt_test_goals=[
        "sql/test",
    ],
)


hive = Module(
    name="hive",
    dependencies=[sql],
    source_file_regexes=[
        "sql/hive/",
        "bin/spark-sql",
    ],
    build_profile_flags=[
        "-Phive",
    ],
    sbt_test_goals=[
        "hive/test",
    ],
    test_tags=[
        "org.apache.spark.tags.ExtendedHiveTest"
    ]
)


hive_thriftserver = Module(
    name="hive-thriftserver",
    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        "sbin/start-thriftserver.sh",
    ],
    build_profile_flags=[
        "-Phive-thriftserver",
    ],
    sbt_test_goals=[
        "hive-thriftserver/test",
    ]
)


sql_kafka = Module(
    name="sql-kafka-0-10",
    dependencies=[sql],
    source_file_regexes=[
        "external/kafka-0-10-sql",
    ],
    sbt_test_goals=[
        "sql-kafka-0-10/test",
    ]
)


sketch = Module(
    name="sketch",
    dependencies=[tags],
    source_file_regexes=[
        "common/sketch/",
    ],
    sbt_test_goals=[
        "sketch/test"
    ]
)


graphx = Module(
    name="graphx",
    dependencies=[tags],
    source_file_regexes=[
        "graphx/",
    ],
    sbt_test_goals=[
        "graphx/test"
    ]
)


streaming = Module(
    name="streaming",
    dependencies=[tags],
    source_file_regexes=[
        "streaming",
    ],
    sbt_test_goals=[
        "streaming/test",
    ]
)


# Don't set the dependencies because changes in other modules should not trigger Kinesis tests.
# Kinesis tests depend on the external Amazon Kinesis service. We should run these tests only when
# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't
# fail other PRs.
streaming_kinesis_asl = Module(
    name="streaming-kinesis-asl",
    dependencies=[tags],
    source_file_regexes=[
        "external/kinesis-asl/",
        "external/kinesis-asl-assembly/",
    ],
    build_profile_flags=[
        "-Pkinesis-asl",
    ],
    environ={
        "ENABLE_KINESIS_TESTS": "1"
    },
    sbt_test_goals=[
        "streaming-kinesis-asl/test",
    ]
)


streaming_kafka = Module(
    name="streaming-kafka-0-8",
    dependencies=[streaming],
    source_file_regexes=[
        "external/kafka-0-8",
        "external/kafka-0-8-assembly",
    ],
    sbt_test_goals=[
        "streaming-kafka-0-8/test",
    ]
)


streaming_kafka_0_10 = Module(
    name="streaming-kafka-0-10",
    dependencies=[streaming],
    source_file_regexes=[
        # The ending "/" is necessary otherwise it will include "sql-kafka" codes
        "external/kafka-0-10/",
        "external/kafka-0-10-assembly",
    ],
    sbt_test_goals=[
        "streaming-kafka-0-10/test",
    ]
)


streaming_flume_sink = Module(
    name="streaming-flume-sink",
    dependencies=[streaming],
    source_file_regexes=[
        "external/flume-sink",
    ],
    sbt_test_goals=[
        "streaming-flume-sink/test",
    ]
)


streaming_flume = Module(
    name="streaming-flume",
    dependencies=[streaming],
    source_file_regexes=[
        "external/flume",
    ],
    sbt_test_goals=[
        "streaming-flume/test",
    ]
)


streaming_flume_assembly = Module(
    name="streaming-flume-assembly",
    dependencies=[streaming_flume, streaming_flume_sink],
    source_file_regexes=[
        "external/flume-assembly",
    ]
)


mllib_local = Module(
    name="mllib-local",
    dependencies=[tags],
    source_file_regexes=[
        "mllib-local",
    ],
    sbt_test_goals=[
        "mllib-local/test",
    ]
)


mllib = Module(
    name="mllib",
    dependencies=[mllib_local, streaming, sql],
    source_file_regexes=[
        "data/mllib/",
        "mllib/",
    ],
    sbt_test_goals=[
        "mllib/test",
    ]
)


examples = Module(
    name="examples",
    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=[
        "examples/",
    ],
    sbt_test_goals=[
        "examples/test",
    ]
)


pyspark_core = Module(
    name="pyspark-core",
    dependencies=[],
    source_file_regexes=[
        "python/(?!pyspark/(ml|mllib|sql|streaming))"
    ],
    python_test_goals=[
        "pyspark.rdd",
        "pyspark.context",
        "pyspark.conf",
        "pyspark.broadcast",
        "pyspark.accumulators",
        "pyspark.serializers",
        "pyspark.profiler",
        "pyspark.shuffle",
        "pyspark.tests",
    ]
)


pyspark_sql = Module(
    name="pyspark-sql",
    dependencies=[pyspark_core, hive],
    source_file_regexes=[
        "python/pyspark/sql"
    ],
    python_test_goals=[
        "pyspark.sql.types",
        "pyspark.sql.context",
        "pyspark.sql.session",
        "pyspark.sql.conf",
        "pyspark.sql.catalog",
        "pyspark.sql.column",
        "pyspark.sql.dataframe",
        "pyspark.sql.group",
        "pyspark.sql.functions",
        "pyspark.sql.readwriter",
        "pyspark.sql.streaming",
        "pyspark.sql.window",
        "pyspark.sql.tests",
    ]
)


pyspark_streaming = Module(
    name="pyspark-streaming",
    dependencies=[
        pyspark_core,
        streaming,
        streaming_kafka,
        streaming_flume_assembly,
        streaming_kinesis_asl
    ],
    source_file_regexes=[
        "python/pyspark/streaming"
    ],
    python_test_goals=[
        "pyspark.streaming.util",
        "pyspark.streaming.tests",
    ]
)


pyspark_mllib = Module(
    name="pyspark-mllib",
    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
    source_file_regexes=[
        "python/pyspark/mllib"
    ],
    python_test_goals=[
        "pyspark.mllib.classification",
        "pyspark.mllib.clustering",
        "pyspark.mllib.evaluation",
        "pyspark.mllib.feature",
        "pyspark.mllib.fpm",
        "pyspark.mllib.linalg.__init__",
        "pyspark.mllib.linalg.distributed",
        "pyspark.mllib.random",
        "pyspark.mllib.recommendation",
        "pyspark.mllib.regression",
        "pyspark.mllib.stat._statistics",
        "pyspark.mllib.stat.KernelDensity",
        "pyspark.mllib.tree",
        "pyspark.mllib.util",
        "pyspark.mllib.tests",
    ],
    blacklisted_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ]
)


pyspark_ml = Module(
    name="pyspark-ml",
    dependencies=[pyspark_core, pyspark_mllib],
    source_file_regexes=[
        "python/pyspark/ml/"
    ],
    python_test_goals=[
        "pyspark.ml.feature",
        "pyspark.ml.classification",
        "pyspark.ml.clustering",
        "pyspark.ml.linalg.__init__",
        "pyspark.ml.recommendation",
        "pyspark.ml.regression",
        "pyspark.ml.tuning",
        "pyspark.ml.tests",
        "pyspark.ml.evaluation",
    ],
    blacklisted_python_implementations=[
        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
    ]
)


sparkr = Module(
    name="sparkr",
    dependencies=[hive, mllib],
    source_file_regexes=[
        "R/",
    ],
    should_run_r_tests=True
)


docs = Module(
    name="docs",
    dependencies=[],
    source_file_regexes=[
        "docs/",
    ]
)


build = Module(
    name="build",
    dependencies=[],
    source_file_regexes=[
        ".*pom.xml",
        "dev/test-dependencies.sh",
    ],
    should_run_build_tests=True
)


yarn = Module(
    name="yarn",
    dependencies=[],
    source_file_regexes=[
        "yarn/",
        "common/network-yarn/",
    ],
    build_profile_flags=["-Pyarn"],
    sbt_test_goals=[
        "yarn/test",
        "network-yarn/test",
    ],
    test_tags=[
        "org.apache.spark.tags.ExtendedYarnTest"
    ]
)


mesos = Module(
    name="mesos",
    dependencies=[],
    source_file_regexes=["mesos/"],
    build_profile_flags=["-Pmesos"],
    sbt_test_goals=["mesos/test"]
)


# The root module is a dummy module which is used to run all of the tests.
# No other modules should directly depend on this module.
root = Module(
    name="root",
    dependencies=[build],  # Changes to build should trigger all tests.
    source_file_regexes=[],
    # In order to run all of the tests, enable every test profile. The flags are de-duplicated
    # and sorted so that the resulting list is the same on every run.
    build_profile_flags=sorted(set(
        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
    sbt_test_goals=[
        "test",
    ],
    python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)),
    should_run_r_tests=True,
    should_run_build_tests=True
)