1library(testthat)
2library(recipes)
3
4library(modeldata)
5data(okc)
6
7set.seed(19)
8in_test <- 1:200
9
10okc_tr <- okc[-in_test,]
11okc_te <- okc[ in_test,]
12
13rec <- recipe(~ diet + location, data = okc_tr)
14
15# assume no novel levels here but test later:
16# all(sort(unique(okc_tr$location)) == sort(unique(okc$location)))
17
18test_that('default inputs', {
19  others <- rec %>% step_other(diet, location, other = "another", id = "")
20
21  tidy_exp_un <- tibble(
22    terms = c("diet", "location"),
23    retained = rep(NA_character_, 2),
24    id = ""
25  )
26  expect_equal(tidy_exp_un, tidy(others, number = 1))
27
28  others <- prep(others, training = okc_tr)
29  others_te <- bake(others, new_data = okc_te)
30
31  tidy_exp_tr <- tibble(
32    terms = rep(c("diet", "location"), c(4, 3)),
33    retained = c(
34      "anything", "mostly anything", "mostly vegetarian",
35      "strictly anything", "berkeley",
36      "oakland", "san francisco"),
37    id = ""
38  )
39  expect_equal(tidy_exp_tr, tidy(others, number = 1))
40
41  diet_props <- table(okc_tr$diet)/sum(!is.na(okc_tr$diet))
42  diet_props <- sort(diet_props, decreasing = TRUE)
43  diet_levels <- names(diet_props)[diet_props >= others$step[[1]]$threshold]
44  for (i in diet_levels)
45    expect_equal(sum(others_te$diet == i, na.rm = TRUE),
46                 sum(okc_te$diet == i, na.rm = TRUE))
47
48  diet_levels <- c(diet_levels, others$step[[1]]$objects[["diet"]]$other)
49  expect_true(all(levels(others_te$diet) %in% diet_levels))
50  expect_true(all(diet_levels %in% levels(others_te$diet)))
51
52  location_props <- table(okc_tr$location)/sum(!is.na(okc_tr$location))
53  location_props <- sort(location_props, decreasing = TRUE)
54  location_levels <- names(location_props)[location_props >= others$step[[1]]$threshold]
55  for (i in location_levels)
56    expect_equal(sum(others_te$location == i, na.rm = TRUE),
57                 sum(okc_te$location == i, na.rm = TRUE))
58
59  location_levels <- c(location_levels, others$step[[1]]$objects[["location"]]$other)
60  expect_true(all(levels(others_te$location) %in% location_levels))
61  expect_true(all(location_levels %in% levels(others_te$location)))
62
63  expect_equal(is.na(okc_te$diet), is.na(others_te$diet))
64  expect_equal(is.na(okc_te$location), is.na(others_te$location))
65})
66
67
68test_that('high threshold - much removals', {
69  others <- rec %>% step_other(diet, location, threshold = .5)
70  others <- prep(others, training = okc_tr)
71  others_te <- bake(others, new_data = okc_te)
72
73  diet_props <- table(okc_tr$diet)
74  diet_levels <- others$steps[[1]]$objects$diet$keep
75  for (i in diet_levels)
76    expect_equal(sum(others_te$diet == i, na.rm = TRUE),
77                 sum(okc_te$diet == i, na.rm = TRUE))
78
79  diet_levels <- c(diet_levels, others$step[[1]]$objects[["diet"]]$other)
80  expect_true(all(levels(others_te$diet) %in% diet_levels))
81  expect_true(all(diet_levels %in% levels(others_te$diet)))
82
83  location_props <- table(okc_tr$location)
84  location_levels <- others$steps[[1]]$objects$location$keep
85  for (i in location_levels)
86    expect_equal(sum(others_te$location == i, na.rm = TRUE),
87                 sum(okc_te$location == i, na.rm = TRUE))
88
89  location_levels <- c(location_levels, others$step[[1]]$objects[["location"]]$other)
90  expect_true(all(levels(others_te$location) %in% location_levels))
91  expect_true(all(location_levels %in% levels(others_te$location)))
92
93  expect_equal(is.na(okc_te$diet), is.na(others_te$diet))
94  expect_equal(is.na(okc_te$location), is.na(others_te$location))
95})
96
97
98test_that('low threshold - no removals', {
99  others <- rec %>% step_other(diet, location, threshold = 10^-30, other = "another")
100  others <- prep(others, training = okc_tr, strings_as_factors = FALSE)
101  others_te <- bake(others, new_data = okc_te)
102
103  expect_equal(is.na(okc_te$diet), is.na(others_te$diet))
104  expect_equal(is.na(okc_te$location), is.na(others_te$location))
105
106  expect_equal(okc_te$diet, as.character(others_te$diet))
107  expect_equal(okc_te$location, as.character(others_te$location))
108})
109
110
111test_that('factor inputs', {
112
113  okc$diet <- as.factor(okc$diet)
114  okc$location <- as.factor(okc$location)
115
116  okc_tr <- okc[-in_test,]
117  okc_te <- okc[ in_test,]
118
119  rec <- recipe(~ diet + location, data = okc_tr)
120
121  others <- rec %>% step_other(diet, location)
122  others <- prep(others, training = okc_tr)
123  others_te <- bake(others, new_data = okc_te)
124
125  diet_props <- table(okc_tr$diet)/sum(!is.na(okc_tr$diet))
126  diet_props <- sort(diet_props, decreasing = TRUE)
127  diet_levels <- names(diet_props)[diet_props >= others$step[[1]]$threshold]
128  for (i in diet_levels)
129    expect_equal(sum(others_te$diet == i, na.rm = TRUE),
130                 sum(okc_te$diet == i, na.rm = TRUE))
131
132  diet_levels <- c(diet_levels, others$step[[1]]$objects[["diet"]]$other)
133  expect_true(all(levels(others_te$diet) %in% diet_levels))
134  expect_true(all(diet_levels %in% levels(others_te$diet)))
135
136  location_props <- table(okc_tr$location)/sum(!is.na(okc_tr$location))
137  location_props <- sort(location_props, decreasing = TRUE)
138  location_levels <- names(location_props)[location_props >= others$step[[1]]$threshold]
139  for (i in location_levels)
140    expect_equal(sum(others_te$location == i, na.rm = TRUE),
141                 sum(okc_te$location == i, na.rm = TRUE))
142
143  location_levels <- c(location_levels, others$step[[1]]$objects[["location"]]$other)
144  expect_true(all(levels(others_te$location) %in% location_levels))
145  expect_true(all(location_levels %in% levels(others_te$location)))
146
147  expect_equal(is.na(okc_te$diet), is.na(others_te$diet))
148  expect_equal(is.na(okc_te$location), is.na(others_te$location))
149})
150
151
152test_that('novel levels', {
153  df <- data.frame(
154    y = c(1,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0,0,0,1,0),
155    x1 = c('A','B','B','B','B','A','A','A','B','A','A','B',
156           'A','C','C','B','A','B','C','D'),
157    stringsAsFactors = FALSE)
158  training <- df[1:10,]
159  testing <- df[11:20,]
160  training$y <- as.factor(training$y)
161  training$x1 <- as.factor(training$x1)
162  testing$y <- as.factor(testing$y)
163  testing$x1 <- as.factor(testing$x1)
164
165  novel_level <- recipe(y ~ ., data = training) %>%
166    step_other(x1)
167
168  novel_level <- prep(novel_level, training = training)
169  new_results <- bake(novel_level, new_data = testing)
170  orig_results <- bake(novel_level, new_data = training)
171  expect_true(all(is.na(new_results$x1[testing$x1 == "C"])))
172  expect_true(!any(orig_results$x1 == "other"))
173
174  training <- df[1:14,]
175  testing <- df[15:20,]
176  training$y <- as.factor(training$y)
177  training$x1 <- as.factor(training$x1)
178  testing$y <- as.factor(testing$y)
179  testing$x1 <- as.factor(testing$x1)
180
181  novel_level <- recipe(y ~ ., data = training) %>%
182    step_other(x1, threshold = .1)
183
184  novel_level <- prep(novel_level, training = training)
185  new_results <- bake(novel_level, new_data = testing)
186  orig_results <- bake(novel_level, new_data = training)
187  expect_true(all(new_results$x1[testing$x1 == "D"] == "other"))
188  expect_true(any(new_results$x1 == "other"))
189})
190
191test_that("'other' already in use", {
192  others <- rec %>% step_other(diet, location, threshold = 10^-10)
193  expect_error(
194    prep(others, training = okc_tr, strings_as_factors = FALSE)
195  )
196})
197
198test_that('printing', {
199  rec <- rec %>% step_other(diet, location)
200  expect_output(print(rec))
201  expect_output(prep(rec, training = okc_tr, verbose = TRUE))
202})
203
204test_that(
205  desc = "if threshold argument is an integer greater than one
206          then it's treated as a frequency",
207  code = {
208    others <- rec %>% step_other(diet, location, threshold = 3000, other = "another", id = "")
209
210    tidy_exp_un <- tibble(
211      terms = c("diet", "location"),
212      retained = rep(NA_character_, 2),
213      id = ""
214    )
215
216    expect_equal(tidy_exp_un, tidy(others, number = 1))
217
218    others <- prep(others, training = okc_tr)
219
220    tidy_exp_tr <- tibble(
221      terms = rep(c("diet", "location"), c(4, 3)),
222      retained = c(
223        "anything", "mostly anything", "mostly vegetarian",
224        "strictly anything", "berkeley",
225        "oakland", "san francisco"),
226      id = ""
227    )
228    expect_equal(tidy_exp_tr, tidy(others, number = 1))
229  }
230)
231
232test_that(
233  desc = "if the threshold argument is greather than one then it should be
234          an integer(ish)",
235  code = {
236    expect_error(rec %>% step_other(diet, location, threshold = 3.14))
237  }
238)
239
240test_that(
241  desc = "if threshold is equal to 1 then the function removes every factor
242          level that is not present in the data",
243  code = {
244    fake_data <- data.frame(
245      test_factor = factor(c("A", "B"), levels = c("A", "B", "C"))
246    )
247
248    rec <- recipe(~ test_factor, data = fake_data)
249    others <- rec %>% step_other(test_factor, threshold = 1, id = "") %>% prep()
250
251    tidy_exp_tr <- tibble(
252      terms = rep("test_factor", 2),
253      retained = c("A", "B"),
254      id = ""
255    )
256    expect_equal(tidy_exp_tr, tidy(others, number = 1))
257  }
258)
259
260
261test_that('tunable', {
262  rec <-
263    recipe(~ ., data = iris) %>%
264    step_other(all_predictors())
265  rec_param <- tunable.step_other(rec$steps[[1]])
266  expect_equal(rec_param$name, c("threshold"))
267  expect_true(all(rec_param$source == "recipe"))
268  expect_true(is.list(rec_param$call_info))
269  expect_equal(nrow(rec_param), 1)
270  expect_equal(
271    names(rec_param),
272    c('name', 'call_info', 'source', 'component', 'component_id')
273  )
274})
275
276
277test_that('issue #415 -  strings to factor conversion', {
278  trans_recipe <-
279    recipe(Species ~ ., data = iris)
280
281  prepped <- prep(trans_recipe, iris)
282
283  iris_no_outcome <- iris
284  iris_no_outcome["Species"] <- NULL
285
286  expect_error(
287    res <- bake(prepped, iris_no_outcome),
288    regex = NA
289  )
290  expect_equal(names(res), names(iris[, 1:4]))
291})
292