1library(testthat) 2library(recipes) 3 4library(modeldata) 5data(okc) 6 7set.seed(19) 8in_test <- 1:200 9 10okc_tr <- okc[-in_test,] 11okc_te <- okc[ in_test,] 12 13rec <- recipe(~ diet + location, data = okc_tr) 14 15# assume no novel levels here but test later: 16# all(sort(unique(okc_tr$location)) == sort(unique(okc$location))) 17 18test_that('default inputs', { 19 others <- rec %>% step_other(diet, location, other = "another", id = "") 20 21 tidy_exp_un <- tibble( 22 terms = c("diet", "location"), 23 retained = rep(NA_character_, 2), 24 id = "" 25 ) 26 expect_equal(tidy_exp_un, tidy(others, number = 1)) 27 28 others <- prep(others, training = okc_tr) 29 others_te <- bake(others, new_data = okc_te) 30 31 tidy_exp_tr <- tibble( 32 terms = rep(c("diet", "location"), c(4, 3)), 33 retained = c( 34 "anything", "mostly anything", "mostly vegetarian", 35 "strictly anything", "berkeley", 36 "oakland", "san francisco"), 37 id = "" 38 ) 39 expect_equal(tidy_exp_tr, tidy(others, number = 1)) 40 41 diet_props <- table(okc_tr$diet)/sum(!is.na(okc_tr$diet)) 42 diet_props <- sort(diet_props, decreasing = TRUE) 43 diet_levels <- names(diet_props)[diet_props >= others$step[[1]]$threshold] 44 for (i in diet_levels) 45 expect_equal(sum(others_te$diet == i, na.rm = TRUE), 46 sum(okc_te$diet == i, na.rm = TRUE)) 47 48 diet_levels <- c(diet_levels, others$step[[1]]$objects[["diet"]]$other) 49 expect_true(all(levels(others_te$diet) %in% diet_levels)) 50 expect_true(all(diet_levels %in% levels(others_te$diet))) 51 52 location_props <- table(okc_tr$location)/sum(!is.na(okc_tr$location)) 53 location_props <- sort(location_props, decreasing = TRUE) 54 location_levels <- names(location_props)[location_props >= others$step[[1]]$threshold] 55 for (i in location_levels) 56 expect_equal(sum(others_te$location == i, na.rm = TRUE), 57 sum(okc_te$location == i, na.rm = TRUE)) 58 59 location_levels <- c(location_levels, others$step[[1]]$objects[["location"]]$other) 60 expect_true(all(levels(others_te$location) %in% location_levels)) 61 expect_true(all(location_levels %in% levels(others_te$location))) 62 63 expect_equal(is.na(okc_te$diet), is.na(others_te$diet)) 64 expect_equal(is.na(okc_te$location), is.na(others_te$location)) 65}) 66 67 68test_that('high threshold - much removals', { 69 others <- rec %>% step_other(diet, location, threshold = .5) 70 others <- prep(others, training = okc_tr) 71 others_te <- bake(others, new_data = okc_te) 72 73 diet_props <- table(okc_tr$diet) 74 diet_levels <- others$steps[[1]]$objects$diet$keep 75 for (i in diet_levels) 76 expect_equal(sum(others_te$diet == i, na.rm = TRUE), 77 sum(okc_te$diet == i, na.rm = TRUE)) 78 79 diet_levels <- c(diet_levels, others$step[[1]]$objects[["diet"]]$other) 80 expect_true(all(levels(others_te$diet) %in% diet_levels)) 81 expect_true(all(diet_levels %in% levels(others_te$diet))) 82 83 location_props <- table(okc_tr$location) 84 location_levels <- others$steps[[1]]$objects$location$keep 85 for (i in location_levels) 86 expect_equal(sum(others_te$location == i, na.rm = TRUE), 87 sum(okc_te$location == i, na.rm = TRUE)) 88 89 location_levels <- c(location_levels, others$step[[1]]$objects[["location"]]$other) 90 expect_true(all(levels(others_te$location) %in% location_levels)) 91 expect_true(all(location_levels %in% levels(others_te$location))) 92 93 expect_equal(is.na(okc_te$diet), is.na(others_te$diet)) 94 expect_equal(is.na(okc_te$location), is.na(others_te$location)) 95}) 96 97 98test_that('low threshold - no removals', { 99 others <- rec %>% step_other(diet, location, threshold = 10^-30, other = "another") 100 others <- prep(others, training = okc_tr, strings_as_factors = FALSE) 101 others_te <- bake(others, new_data = okc_te) 102 103 expect_equal(is.na(okc_te$diet), is.na(others_te$diet)) 104 expect_equal(is.na(okc_te$location), is.na(others_te$location)) 105 106 expect_equal(okc_te$diet, as.character(others_te$diet)) 107 expect_equal(okc_te$location, as.character(others_te$location)) 108}) 109 110 111test_that('factor inputs', { 112 113 okc$diet <- as.factor(okc$diet) 114 okc$location <- as.factor(okc$location) 115 116 okc_tr <- okc[-in_test,] 117 okc_te <- okc[ in_test,] 118 119 rec <- recipe(~ diet + location, data = okc_tr) 120 121 others <- rec %>% step_other(diet, location) 122 others <- prep(others, training = okc_tr) 123 others_te <- bake(others, new_data = okc_te) 124 125 diet_props <- table(okc_tr$diet)/sum(!is.na(okc_tr$diet)) 126 diet_props <- sort(diet_props, decreasing = TRUE) 127 diet_levels <- names(diet_props)[diet_props >= others$step[[1]]$threshold] 128 for (i in diet_levels) 129 expect_equal(sum(others_te$diet == i, na.rm = TRUE), 130 sum(okc_te$diet == i, na.rm = TRUE)) 131 132 diet_levels <- c(diet_levels, others$step[[1]]$objects[["diet"]]$other) 133 expect_true(all(levels(others_te$diet) %in% diet_levels)) 134 expect_true(all(diet_levels %in% levels(others_te$diet))) 135 136 location_props <- table(okc_tr$location)/sum(!is.na(okc_tr$location)) 137 location_props <- sort(location_props, decreasing = TRUE) 138 location_levels <- names(location_props)[location_props >= others$step[[1]]$threshold] 139 for (i in location_levels) 140 expect_equal(sum(others_te$location == i, na.rm = TRUE), 141 sum(okc_te$location == i, na.rm = TRUE)) 142 143 location_levels <- c(location_levels, others$step[[1]]$objects[["location"]]$other) 144 expect_true(all(levels(others_te$location) %in% location_levels)) 145 expect_true(all(location_levels %in% levels(others_te$location))) 146 147 expect_equal(is.na(okc_te$diet), is.na(others_te$diet)) 148 expect_equal(is.na(okc_te$location), is.na(others_te$location)) 149}) 150 151 152test_that('novel levels', { 153 df <- data.frame( 154 y = c(1,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0,0,0,1,0), 155 x1 = c('A','B','B','B','B','A','A','A','B','A','A','B', 156 'A','C','C','B','A','B','C','D'), 157 stringsAsFactors = FALSE) 158 training <- df[1:10,] 159 testing <- df[11:20,] 160 training$y <- as.factor(training$y) 161 training$x1 <- as.factor(training$x1) 162 testing$y <- as.factor(testing$y) 163 testing$x1 <- as.factor(testing$x1) 164 165 novel_level <- recipe(y ~ ., data = training) %>% 166 step_other(x1) 167 168 novel_level <- prep(novel_level, training = training) 169 new_results <- bake(novel_level, new_data = testing) 170 orig_results <- bake(novel_level, new_data = training) 171 expect_true(all(is.na(new_results$x1[testing$x1 == "C"]))) 172 expect_true(!any(orig_results$x1 == "other")) 173 174 training <- df[1:14,] 175 testing <- df[15:20,] 176 training$y <- as.factor(training$y) 177 training$x1 <- as.factor(training$x1) 178 testing$y <- as.factor(testing$y) 179 testing$x1 <- as.factor(testing$x1) 180 181 novel_level <- recipe(y ~ ., data = training) %>% 182 step_other(x1, threshold = .1) 183 184 novel_level <- prep(novel_level, training = training) 185 new_results <- bake(novel_level, new_data = testing) 186 orig_results <- bake(novel_level, new_data = training) 187 expect_true(all(new_results$x1[testing$x1 == "D"] == "other")) 188 expect_true(any(new_results$x1 == "other")) 189}) 190 191test_that("'other' already in use", { 192 others <- rec %>% step_other(diet, location, threshold = 10^-10) 193 expect_error( 194 prep(others, training = okc_tr, strings_as_factors = FALSE) 195 ) 196}) 197 198test_that('printing', { 199 rec <- rec %>% step_other(diet, location) 200 expect_output(print(rec)) 201 expect_output(prep(rec, training = okc_tr, verbose = TRUE)) 202}) 203 204test_that( 205 desc = "if threshold argument is an integer greater than one 206 then it's treated as a frequency", 207 code = { 208 others <- rec %>% step_other(diet, location, threshold = 3000, other = "another", id = "") 209 210 tidy_exp_un <- tibble( 211 terms = c("diet", "location"), 212 retained = rep(NA_character_, 2), 213 id = "" 214 ) 215 216 expect_equal(tidy_exp_un, tidy(others, number = 1)) 217 218 others <- prep(others, training = okc_tr) 219 220 tidy_exp_tr <- tibble( 221 terms = rep(c("diet", "location"), c(4, 3)), 222 retained = c( 223 "anything", "mostly anything", "mostly vegetarian", 224 "strictly anything", "berkeley", 225 "oakland", "san francisco"), 226 id = "" 227 ) 228 expect_equal(tidy_exp_tr, tidy(others, number = 1)) 229 } 230) 231 232test_that( 233 desc = "if the threshold argument is greather than one then it should be 234 an integer(ish)", 235 code = { 236 expect_error(rec %>% step_other(diet, location, threshold = 3.14)) 237 } 238) 239 240test_that( 241 desc = "if threshold is equal to 1 then the function removes every factor 242 level that is not present in the data", 243 code = { 244 fake_data <- data.frame( 245 test_factor = factor(c("A", "B"), levels = c("A", "B", "C")) 246 ) 247 248 rec <- recipe(~ test_factor, data = fake_data) 249 others <- rec %>% step_other(test_factor, threshold = 1, id = "") %>% prep() 250 251 tidy_exp_tr <- tibble( 252 terms = rep("test_factor", 2), 253 retained = c("A", "B"), 254 id = "" 255 ) 256 expect_equal(tidy_exp_tr, tidy(others, number = 1)) 257 } 258) 259 260 261test_that('tunable', { 262 rec <- 263 recipe(~ ., data = iris) %>% 264 step_other(all_predictors()) 265 rec_param <- tunable.step_other(rec$steps[[1]]) 266 expect_equal(rec_param$name, c("threshold")) 267 expect_true(all(rec_param$source == "recipe")) 268 expect_true(is.list(rec_param$call_info)) 269 expect_equal(nrow(rec_param), 1) 270 expect_equal( 271 names(rec_param), 272 c('name', 'call_info', 'source', 'component', 'component_id') 273 ) 274}) 275 276 277test_that('issue #415 - strings to factor conversion', { 278 trans_recipe <- 279 recipe(Species ~ ., data = iris) 280 281 prepped <- prep(trans_recipe, iris) 282 283 iris_no_outcome <- iris 284 iris_no_outcome["Species"] <- NULL 285 286 expect_error( 287 res <- bake(prepped, iris_no_outcome), 288 regex = NA 289 ) 290 expect_equal(names(res), names(iris[, 1:4])) 291}) 292