#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Test suite for the session/context management helpers defined in sparkR.R.
context("test functions in sparkR.R")
19
test_that("Check masked functions", {
  # Check that we are not masking any new function from base, stats, testthat unexpectedly
  # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
  # hard for users to use base R functions. Please check when in doubt.
  namesOfMaskedCompletely <- c("cov", "filter", "sample")
  namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
                     "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
                     "summary", "transform", "drop", "window", "as.data.frame", "union")
  # endsWith/startsWith entered base R in 3.3.0, so SparkR masks them only from that
  # version on. Compare the full version with getRversion(): the previous check
  # (`major >= 3 && minor >= 3`) wrongly excluded R 4.0+, where the minor component
  # resets to 0 across a major version bump.
  if (getRversion() >= "3.3.0") {
    namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
  }
  masked <- conflicts(detail = TRUE)$`package:SparkR`
  expect_true("describe" %in% masked)  # only when with testthat..
  # A name is considered masked *by SparkR* when its first showMethods() line
  # points at the SparkR package (or is empty for non-generics).
  func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] })
  funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func)
  maskedBySparkR <- masked[funcSparkROrEmpty]
  expect_equal(length(maskedBySparkR), length(namesOfMasked))
  # make the 2 lists the same length so expect_equal will print their content
  l <- max(length(maskedBySparkR), length(namesOfMasked))
  length(maskedBySparkR) <- l
  length(namesOfMasked) <- l
  expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE))
  # above are those reported as masked when `library(SparkR)`
  # note that many of these methods are still callable without base:: or stats:: prefix
  # there should be a test for each of these, except followings, which are currently "broken"
  # A generic with an "ANY" signature still dispatches to the base implementation;
  # only names without one are masked completely.
  funcHasAny <- unlist(lapply(masked, function(x) {
                                        any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1])))
                                      }))
  maskedCompletely <- masked[!funcHasAny]
  expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
  l <- max(length(maskedCompletely), length(namesOfMaskedCompletely))
  length(maskedCompletely) <- l
  length(namesOfMaskedCompletely) <- l
  expect_equal(sort(maskedCompletely, na.last = TRUE),
               sort(namesOfMaskedCompletely, na.last = TRUE))
})
test_that("repeatedly starting and stopping SparkR", {
  # Starting and stopping the legacy SparkContext several times in a row must
  # not leave stale state behind: each fresh context should run a simple job.
  for (iteration in seq_len(4)) {
    spark_ctx <- suppressWarnings(sparkR.init())
    numbers <- parallelize(spark_ctx, 1:20, 2L)
    expect_equal(countRDD(numbers), 20)
    suppressWarnings(sparkR.stop())
  }
})
test_that("repeatedly starting and stopping SparkSession", {
  # Each restarted session must be fully functional: create a data frame whose
  # row count equals the iteration number and verify it round-trips.
  for (n in seq_len(4)) {
    sparkR.session(enableHiveSupport = FALSE)
    sdf <- createDataFrame(data.frame(dummy = seq_len(n)))
    expect_equal(count(sdf), n)
    sparkR.session.stop()
  }
})
test_that("rdd GC across sparkR.stop", {
  # Verifies that RDD ids restart from the same sequence after a full
  # stop/start cycle, and that R-side garbage collection of stale RDD objects
  # (whose ids are reused by new RDDs) does not break the live ones.
  # NOTE(review): the exact statement order below is what is under test —
  # do not reorder these lines.
  sc <- sparkR.sparkContext() # sc should get id 0
  rdd1 <- parallelize(sc, 1:20, 2L) # rdd1 should get id 1
  rdd2 <- parallelize(sc, 1:10, 2L) # rdd2 should get id 2
  sparkR.session.stop()

  sc <- sparkR.sparkContext() # sc should get id 0 again

  # GC rdd1 before creating rdd3 and rdd2 after
  rm(rdd1)
  gc()

  rdd3 <- parallelize(sc, 1:20, 2L) # rdd3 should get id 1 now
  rdd4 <- parallelize(sc, 1:10, 2L) # rdd4 should get id 2 now

  rm(rdd2)
  gc()

  # If collecting the stale rdd2 (old id 2) had freed the new id-2 RDD,
  # these counts would fail.
  countRDD(rdd3)
  countRDD(rdd4)
  sparkR.session.stop()
})
test_that("job group functions can be called", {
  # Smoke test: both the current signatures and the deprecated ones (taking
  # the SparkContext as first argument) must be callable without error.
  sc <- sparkR.sparkContext()
  setJobGroup("groupId", "job description", TRUE)
  cancelJobGroup("groupId")
  clearJobGroup()

  # The deprecated forms emit warnings; silence them, since only
  # callability is being checked here.
  suppressWarnings({
    setJobGroup(sc, "groupId", "job description", TRUE)
    cancelJobGroup(sc, "groupId")
    clearJobGroup(sc)
  })
  sparkR.session.stop()
})
test_that("utility function can be called", {
  # Smoke test: setLogLevel should be callable against a live SparkContext.
  sparkR.sparkContext()
  setLogLevel("ERROR")
  sparkR.session.stop()
})
test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
  # Only whitelisted spark.driver.* properties should be turned into
  # spark-submit flags; anything else in the environment is ignored.
  conf <- new.env()
  conf[["spark.driver.memory"]] <- "512m"
  expect_equal(getClientModeSparkSubmitOpts("sparkrmain", conf),
               "--driver-memory \"512m\" sparkrmain")

  conf[["spark.driver.memory"]] <- "5g"
  conf[["spark.driver.extraClassPath"]] <- "/opt/class_path" # nolint
  conf[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings"
  conf[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" # nolint
  conf[["random"]] <- "skipthis" # not whitelisted: must not appear in the output
  # nolint start
  expect_equal(getClientModeSparkSubmitOpts("sparkr-shell", conf),
               paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"",
                      "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"",
                      "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell"))
  # nolint end

  # Values that are too short to be meaningful are dropped as well.
  conf[["spark.driver.extraClassPath"]] <- "/" # too short
  # nolint start
  expect_equal(getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", conf),
               paste0("--driver-java-options \"-XX:+UseCompressedOops ",
                      "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"",
                      " --driver-memory 4g sparkr-shell2"))
  # nolint end
})
test_that("sparkJars sparkPackages as comma-separated strings", {
  # Whitespace-padded, comma-separated jar lists are split and trimmed, and a
  # warning is emitted for suspicious-looking input.
  expect_warning(processSparkJars(" a, b "))
  expect_equal(lapply(suppressWarnings(processSparkJars(" a, b ")), basename),
               list("a", "b"))

  # Empty segments produced by doubled commas are discarded.
  expect_equal(lapply(suppressWarnings(processSparkJars(" abc ,, def ")), basename),
               list("abc", "def"))

  # Vector input mixes with comma-splitting; blanks vanish entirely.
  expect_equal(
    lapply(suppressWarnings(processSparkJars(c(" abc ,, def ", "", "xyz", " ", "a,b"))),
           basename),
    list("abc", "def", "xyz", "a", "b"))

  expect_equal(processSparkPackages(c("ghi", "lmn")), c("ghi", "lmn"))

  # check normalizePath: an existing file in the working directory passes
  # through without a warning and keeps its name.
  firstFile <- dir()[[1]]
  expect_warning(processSparkJars(firstFile), NA)
  expect_match(processSparkJars(firstFile), firstFile)
})
test_that("spark.lapply should perform simple transforms", {
  # Distribute a doubling function over 1..10 and compare with the local result.
  sparkR.sparkContext()
  doubledValues <- spark.lapply(1:10, function(n) n * 2)
  expect_equal(doubledValues, as.list(seq(2, 20, by = 2)))
  sparkR.session.stop()
})
test_that("add and get file to be downloaded with Spark job on every node", {
  sparkR.sparkContext()

  # --- Single file: spark.addFile then spark.getSparkFiles round-trips it. ---
  helloPath <- tempfile(pattern = "hello", fileext = ".txt")
  helloName <- basename(helloPath)
  helloText <- "Hello World!"
  writeLines(helloText, helloPath)
  spark.addFile(helloPath)
  fetchedPath <- spark.getSparkFiles(helloName)
  expect_equal(readLines(fetchedPath), helloText)

  # --- spark.getSparkFiles must also resolve the file on executors. ---
  inputs <- seq(from = 1, to = 10, length.out = 5)
  remotePaths <- spark.lapply(inputs, function(unused) spark.getSparkFiles(helloName))
  for (remotePath in remotePaths) {
    expect_equal(basename(remotePath), helloName)
  }

  unlink(helloPath)

  # --- Directory: a recursive add should preserve the nested layout. ---
  topDir <- file.path(tempdir(), "recursive_dir")
  dir.create(topDir)
  topName <- basename(topDir)
  topFile <- file.path(topDir, "hello.txt")
  file.create(topFile)
  nestedDir <- file.path(topDir, "sub_hello")
  dir.create(nestedDir)
  nestedFile <- file.path(nestedDir, "sub_hello.txt")
  file.create(nestedFile)
  topText <- "Hello World!"
  nestedText <- "Sub Hello World!"
  writeLines(topText, topFile)
  writeLines(nestedText, nestedFile)
  spark.addFile(topDir, recursive = TRUE)
  expect_equal(readLines(spark.getSparkFiles(file.path(topName, "hello.txt"))), topText)
  expect_equal(readLines(spark.getSparkFiles(file.path(topName, "sub_hello/sub_hello.txt"))),
               nestedText)
  unlink(topDir, recursive = TRUE)
  sparkR.session.stop()
})