1#
2# Licensed to the Apache Software Foundation (ASF) under one or more
3# contributor license agreements.  See the NOTICE file distributed with
4# this work for additional information regarding copyright ownership.
5# The ASF licenses this file to You under the Apache License, Version 2.0
6# (the "License"); you may not use this file except in compliance with
7# the License.  You may obtain a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18from functools import total_ordering
19import itertools
20import re
21
22all_modules = []
23
24
25@total_ordering
26class Module(object):
27    """
28    A module is the basic abstraction in our test runner script. Each module consists of a set of
29    source files, a set of test commands, and a set of dependencies on other modules. We use modules
30    to define a dependency graph that lets determine which tests to run based on which files have
31    changed.
32    """
33
34    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={},
35                 sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(),
36                 test_tags=(), should_run_r_tests=False, should_run_build_tests=False):
37        """
38        Define a new module.
39
40        :param name: A short module name, for display in logging and error messages.
41        :param dependencies: A set of dependencies for this module. This should only include direct
42            dependencies; transitive dependencies are resolved automatically.
43        :param source_file_regexes: a set of regexes that match source files belonging to this
44            module. These regexes are applied by attempting to match at the beginning of the
45            filename strings.
46        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
47            order to build and test this module (e.g. '-PprofileName').
48        :param environ: A dict of environment variables that should be set when files in this
49            module are changed.
50        :param sbt_test_goals: A set of SBT test goals for testing this module.
51        :param python_test_goals: A set of Python test goals for testing this module.
52        :param blacklisted_python_implementations: A set of Python implementations that are not
53            supported by this module's Python components. The values in this set should match
54            strings returned by Python's `platform.python_implementation()`.
55        :param test_tags A set of tags that will be excluded when running unit tests if the module
56            is not explicitly changed.
57        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
58        :param should_run_build_tests: If true, changes in this module will trigger build tests.
59        """
60        self.name = name
61        self.dependencies = dependencies
62        self.source_file_prefixes = source_file_regexes
63        self.sbt_test_goals = sbt_test_goals
64        self.build_profile_flags = build_profile_flags
65        self.environ = environ
66        self.python_test_goals = python_test_goals
67        self.blacklisted_python_implementations = blacklisted_python_implementations
68        self.test_tags = test_tags
69        self.should_run_r_tests = should_run_r_tests
70        self.should_run_build_tests = should_run_build_tests
71
72        self.dependent_modules = set()
73        for dep in dependencies:
74            dep.dependent_modules.add(self)
75        all_modules.append(self)
76
77    def contains_file(self, filename):
78        return any(re.match(p, filename) for p in self.source_file_prefixes)
79
80    def __repr__(self):
81        return "Module<%s>" % self.name
82
83    def __lt__(self, other):
84        return self.name < other.name
85
86    def __eq__(self, other):
87        return self.name == other.name
88
89    def __ne__(self, other):
90        return not (self.name == other.name)
91
92    def __hash__(self):
93        return hash(self.name)
94
95tags = Module(
96    name="tags",
97    dependencies=[],
98    source_file_regexes=[
99        "common/tags/",
100    ]
101)
102
103catalyst = Module(
104    name="catalyst",
105    dependencies=[tags],
106    source_file_regexes=[
107        "sql/catalyst/",
108    ],
109    sbt_test_goals=[
110        "catalyst/test",
111    ],
112)
113
114
115sql = Module(
116    name="sql",
117    dependencies=[catalyst],
118    source_file_regexes=[
119        "sql/core/",
120    ],
121    sbt_test_goals=[
122        "sql/test",
123    ],
124)
125
126hive = Module(
127    name="hive",
128    dependencies=[sql],
129    source_file_regexes=[
130        "sql/hive/",
131        "bin/spark-sql",
132    ],
133    build_profile_flags=[
134        "-Phive",
135    ],
136    sbt_test_goals=[
137        "hive/test",
138    ],
139    test_tags=[
140        "org.apache.spark.tags.ExtendedHiveTest"
141    ]
142)
143
144
145hive_thriftserver = Module(
146    name="hive-thriftserver",
147    dependencies=[hive],
148    source_file_regexes=[
149        "sql/hive-thriftserver",
150        "sbin/start-thriftserver.sh",
151    ],
152    build_profile_flags=[
153        "-Phive-thriftserver",
154    ],
155    sbt_test_goals=[
156        "hive-thriftserver/test",
157    ]
158)
159
160
161sql_kafka = Module(
162    name="sql-kafka-0-10",
163    dependencies=[sql],
164    source_file_regexes=[
165        "external/kafka-0-10-sql",
166    ],
167    sbt_test_goals=[
168        "sql-kafka-0-10/test",
169    ]
170)
171
172
173sketch = Module(
174    name="sketch",
175    dependencies=[tags],
176    source_file_regexes=[
177        "common/sketch/",
178    ],
179    sbt_test_goals=[
180        "sketch/test"
181    ]
182)
183
184
185graphx = Module(
186    name="graphx",
187    dependencies=[tags],
188    source_file_regexes=[
189        "graphx/",
190    ],
191    sbt_test_goals=[
192        "graphx/test"
193    ]
194)
195
196
197streaming = Module(
198    name="streaming",
199    dependencies=[tags],
200    source_file_regexes=[
201        "streaming",
202    ],
203    sbt_test_goals=[
204        "streaming/test",
205    ]
206)
207
208
209# Don't set the dependencies because changes in other modules should not trigger Kinesis tests.
210# Kinesis tests depends on external Amazon kinesis service. We should run these tests only when
211# files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't
212# fail other PRs.
213streaming_kinesis_asl = Module(
214    name="streaming-kinesis-asl",
215    dependencies=[tags],
216    source_file_regexes=[
217        "external/kinesis-asl/",
218        "external/kinesis-asl-assembly/",
219    ],
220    build_profile_flags=[
221        "-Pkinesis-asl",
222    ],
223    environ={
224        "ENABLE_KINESIS_TESTS": "1"
225    },
226    sbt_test_goals=[
227        "streaming-kinesis-asl/test",
228    ]
229)
230
231
232streaming_kafka = Module(
233    name="streaming-kafka-0-8",
234    dependencies=[streaming],
235    source_file_regexes=[
236        "external/kafka-0-8",
237        "external/kafka-0-8-assembly",
238    ],
239    sbt_test_goals=[
240        "streaming-kafka-0-8/test",
241    ]
242)
243
244streaming_kafka_0_10 = Module(
245    name="streaming-kafka-0-10",
246    dependencies=[streaming],
247    source_file_regexes=[
248        # The ending "/" is necessary otherwise it will include "sql-kafka" codes
249        "external/kafka-0-10/",
250        "external/kafka-0-10-assembly",
251    ],
252    sbt_test_goals=[
253        "streaming-kafka-0-10/test",
254    ]
255)
256
257streaming_flume_sink = Module(
258    name="streaming-flume-sink",
259    dependencies=[streaming],
260    source_file_regexes=[
261        "external/flume-sink",
262    ],
263    sbt_test_goals=[
264        "streaming-flume-sink/test",
265    ]
266)
267
268
269streaming_flume = Module(
270    name="streaming-flume",
271    dependencies=[streaming],
272    source_file_regexes=[
273        "external/flume",
274    ],
275    sbt_test_goals=[
276        "streaming-flume/test",
277    ]
278)
279
280
281streaming_flume_assembly = Module(
282    name="streaming-flume-assembly",
283    dependencies=[streaming_flume, streaming_flume_sink],
284    source_file_regexes=[
285        "external/flume-assembly",
286    ]
287)
288
289
290mllib_local = Module(
291    name="mllib-local",
292    dependencies=[tags],
293    source_file_regexes=[
294        "mllib-local",
295    ],
296    sbt_test_goals=[
297        "mllib-local/test",
298    ]
299)
300
301
302mllib = Module(
303    name="mllib",
304    dependencies=[mllib_local, streaming, sql],
305    source_file_regexes=[
306        "data/mllib/",
307        "mllib/",
308    ],
309    sbt_test_goals=[
310        "mllib/test",
311    ]
312)
313
314
315examples = Module(
316    name="examples",
317    dependencies=[graphx, mllib, streaming, hive],
318    source_file_regexes=[
319        "examples/",
320    ],
321    sbt_test_goals=[
322        "examples/test",
323    ]
324)
325
326
327pyspark_core = Module(
328    name="pyspark-core",
329    dependencies=[],
330    source_file_regexes=[
331        "python/(?!pyspark/(ml|mllib|sql|streaming))"
332    ],
333    python_test_goals=[
334        "pyspark.rdd",
335        "pyspark.context",
336        "pyspark.conf",
337        "pyspark.broadcast",
338        "pyspark.accumulators",
339        "pyspark.serializers",
340        "pyspark.profiler",
341        "pyspark.shuffle",
342        "pyspark.tests",
343    ]
344)
345
346
347pyspark_sql = Module(
348    name="pyspark-sql",
349    dependencies=[pyspark_core, hive],
350    source_file_regexes=[
351        "python/pyspark/sql"
352    ],
353    python_test_goals=[
354        "pyspark.sql.types",
355        "pyspark.sql.context",
356        "pyspark.sql.session",
357        "pyspark.sql.conf",
358        "pyspark.sql.catalog",
359        "pyspark.sql.column",
360        "pyspark.sql.dataframe",
361        "pyspark.sql.group",
362        "pyspark.sql.functions",
363        "pyspark.sql.readwriter",
364        "pyspark.sql.streaming",
365        "pyspark.sql.window",
366        "pyspark.sql.tests",
367    ]
368)
369
370
371pyspark_streaming = Module(
372    name="pyspark-streaming",
373    dependencies=[
374        pyspark_core,
375        streaming,
376        streaming_kafka,
377        streaming_flume_assembly,
378        streaming_kinesis_asl
379    ],
380    source_file_regexes=[
381        "python/pyspark/streaming"
382    ],
383    python_test_goals=[
384        "pyspark.streaming.util",
385        "pyspark.streaming.tests",
386    ]
387)
388
389
390pyspark_mllib = Module(
391    name="pyspark-mllib",
392    dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib],
393    source_file_regexes=[
394        "python/pyspark/mllib"
395    ],
396    python_test_goals=[
397        "pyspark.mllib.classification",
398        "pyspark.mllib.clustering",
399        "pyspark.mllib.evaluation",
400        "pyspark.mllib.feature",
401        "pyspark.mllib.fpm",
402        "pyspark.mllib.linalg.__init__",
403        "pyspark.mllib.linalg.distributed",
404        "pyspark.mllib.random",
405        "pyspark.mllib.recommendation",
406        "pyspark.mllib.regression",
407        "pyspark.mllib.stat._statistics",
408        "pyspark.mllib.stat.KernelDensity",
409        "pyspark.mllib.tree",
410        "pyspark.mllib.util",
411        "pyspark.mllib.tests",
412    ],
413    blacklisted_python_implementations=[
414        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
415    ]
416)
417
418
419pyspark_ml = Module(
420    name="pyspark-ml",
421    dependencies=[pyspark_core, pyspark_mllib],
422    source_file_regexes=[
423        "python/pyspark/ml/"
424    ],
425    python_test_goals=[
426        "pyspark.ml.feature",
427        "pyspark.ml.classification",
428        "pyspark.ml.clustering",
429        "pyspark.ml.linalg.__init__",
430        "pyspark.ml.recommendation",
431        "pyspark.ml.regression",
432        "pyspark.ml.tuning",
433        "pyspark.ml.tests",
434        "pyspark.ml.evaluation",
435    ],
436    blacklisted_python_implementations=[
437        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
438    ]
439)
440
441sparkr = Module(
442    name="sparkr",
443    dependencies=[hive, mllib],
444    source_file_regexes=[
445        "R/",
446    ],
447    should_run_r_tests=True
448)
449
450
451docs = Module(
452    name="docs",
453    dependencies=[],
454    source_file_regexes=[
455        "docs/",
456    ]
457)
458
459build = Module(
460    name="build",
461    dependencies=[],
462    source_file_regexes=[
463        ".*pom.xml",
464        "dev/test-dependencies.sh",
465    ],
466    should_run_build_tests=True
467)
468
469yarn = Module(
470    name="yarn",
471    dependencies=[],
472    source_file_regexes=[
473        "yarn/",
474        "common/network-yarn/",
475    ],
476    build_profile_flags=["-Pyarn"],
477    sbt_test_goals=[
478        "yarn/test",
479        "network-yarn/test",
480    ],
481    test_tags=[
482        "org.apache.spark.tags.ExtendedYarnTest"
483    ]
484)
485
486mesos = Module(
487    name="mesos",
488    dependencies=[],
489    source_file_regexes=["mesos/"],
490    build_profile_flags=["-Pmesos"],
491    sbt_test_goals=["mesos/test"]
492)
493
494# The root module is a dummy module which is used to run all of the tests.
495# No other modules should directly depend on this module.
496root = Module(
497    name="root",
498    dependencies=[build],  # Changes to build should trigger all tests.
499    source_file_regexes=[],
500    # In order to run all of the tests, enable every test profile:
501    build_profile_flags=list(set(
502        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
503    sbt_test_goals=[
504        "test",
505    ],
506    python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)),
507    should_run_r_tests=True,
508    should_run_build_tests=True
509)
510