1# setdiff for data.tables, internal at the moment #547, used in not-join 2setdiff_ = function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE) { 3 if (!is.data.table(x) || !is.data.table(y)) stop("x and y must both be data.tables") 4 # !ncol redundant since all 0-column data.tables have 0 rows 5 if (!nrow(x)) return(x) 6 by.x = colnamesInt(x, by.x, check_dups=TRUE) 7 if (!nrow(y)) return(unique(x, by=by.x)) 8 by.y = colnamesInt(y, by.y, check_dups=TRUE) 9 if (length(by.x) != length(by.y)) stop("length(by.x) != length(by.y)") 10 # factor in x should've factor/character in y, and viceversa 11 for (a in seq_along(by.x)) { 12 lc = by.y[a] 13 rc = by.x[a] 14 icnam = names(y)[lc] 15 xcnam = names(x)[rc] 16 if ( is.character(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) { 17 stop("When x's column ('",xcnam,"') is character, the corresponding column in y ('",icnam,"') should be factor or character, but found incompatible type '",typeof(y[[lc]]),"'.") 18 } else if ( is.factor(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) { 19 stop("When x's column ('",xcnam,"') is factor, the corresponding column in y ('",icnam,"') should be character or factor, but found incompatible type '",typeof(y[[lc]]),"'.") 20 } else if ( (is.integer(x[[rc]]) || is.double(x[[rc]])) && (is.logical(y[[lc]]) || is.character(y[[lc]])) ) { 21 stop("When x's column ('",xcnam,"') is integer or numeric, the corresponding column in y ('",icnam,"') can not be character or logical types, but found incompatible type '",typeof(y[[lc]]),"'.") 22 } 23 } 24 ux = unique(shallow(x, by.x)) 25 uy = unique(shallow(y, by.y)) 26 ix = duplicated(rbind(uy, ux, use.names=use.names, fill=FALSE))[-seq_len(nrow(uy))] 27 .Call(CsubsetDT, ux, which_(ix, FALSE), seq_along(ux)) # more memory efficient version of which(!ix) 28} 29 30# set operators ---- 31 32funique = function(x) { 33 stopifnot(is.data.table(x)) 34 dup = duplicated(x) 35 if (any(dup)) .Call(CsubsetDT, x, which_(dup, FALSE), seq_along(x)) else x 36} 37 38.set_ops_arg_check = function(x, y, all, .seqn = FALSE, block_list = TRUE) { 39 if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one") 40 if (!is.data.table(x) || !is.data.table(y)) stop("x and y must both be data.tables") 41 if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have the same column names") 42 if (!identical(names(x), names(y))) stop("x and y must have the same column order") 43 bad_types = c("raw", "complex", if (block_list) "list") 44 found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof)) 45 if (any(found)) stop("unsupported column type", if (sum(found) > 1L) "s" else "", 46 " found in x or y: ", brackify(bad_types[found])) 47 super = function(x) { 48 # allow character->factor and integer->numeric because from v1.12.4 i's type is retained by joins, #3820 49 ans = class(x)[1L] 50 switch(ans, factor="character", integer="numeric", ans) 51 } 52 if (!identical(sx<-sapply(x, super), sy<-sapply(y, super))) { 53 w = which.first(sx!=sy) 54 stop("Item ",w," of x is '",class(x[[w]])[1L],"' but the corresponding item of y is '", class(y[[w]])[1L], "'.") 55 } 56 if (.seqn && ".seqn" %chin% names(x)) stop("None of the datasets should contain a column named '.seqn'") 57} 58 59fintersect = function(x, y, all=FALSE) { 60 .set_ops_arg_check(x, y, all, .seqn = TRUE) 61 if (!nrow(x) || !nrow(y)) return(x[0L]) 62 if (all) { 63 x = shallow(x)[, ".seqn" := rowidv(x)] 64 y = shallow(y)[, ".seqn" := rowidv(y)] 65 jn.on = c(".seqn",setdiff(names(y),".seqn")) 66 # fixes #4716 by preserving order of 1st (uses y[x] join) argument instead of 2nd (uses x[y] join) 67 y[x, .SD, .SDcols=setdiff(names(y),".seqn"), nomatch=NULL, on=jn.on] 68 } else { 69 z = funique(x) # fixes #3034. When .. prefix in i= is implemented (TODO), this can be x[funique(..y), on=, multi=] 70 y[z, nomatch=NULL, on=names(y), mult="first"] 71 } 72} 73 74fsetdiff = function(x, y, all=FALSE) { 75 .set_ops_arg_check(x, y, all, .seqn = TRUE) 76 if (!nrow(x)) return(x) 77 if (!nrow(y)) return(if (!all) funique(x) else x) 78 if (all) { 79 x = shallow(x)[, ".seqn" := rowidv(x)] 80 y = shallow(y)[, ".seqn" := rowidv(y)] 81 jn.on = c(".seqn",setdiff(names(x),".seqn")) 82 x[!y, .SD, .SDcols=setdiff(names(x),".seqn"), on=jn.on] 83 } else { 84 funique(x[!y, on=names(x)]) 85 } 86} 87 88funion = function(x, y, all=FALSE) { 89 .set_ops_arg_check(x, y, all, block_list = !all) 90 ans = rbindlist(list(x, y)) 91 if (!all) ans = funique(ans) 92 ans 93} 94 95fsetequal = function(x, y, all=TRUE) { 96 .set_ops_arg_check(x, y, all) 97 if (!all) { 98 x = funique(x) 99 y = funique(y) 100 } 101 isTRUE(all.equal.data.table(x, y, check.attributes = FALSE, ignore.row.order = TRUE)) 102} 103 104# all.equal ---- 105 106all.equal.data.table = function(target, current, trim.levels=TRUE, check.attributes=TRUE, ignore.col.order=FALSE, ignore.row.order=FALSE, tolerance=sqrt(.Machine$double.eps), ...) { 107 stopifnot(is.logical(trim.levels), is.logical(check.attributes), is.logical(ignore.col.order), is.logical(ignore.row.order), is.numeric(tolerance), is.data.table(target)) 108 109 if (!is.data.table(current)) { 110 if (check.attributes) return(paste0('target is data.table, current is ', data.class(current))) 111 try({current = as.data.table(current)}, silent = TRUE) 112 if (!is.data.table(current)) return('target is data.table but current is not and failed to be coerced to it') 113 } 114 115 msg = character(0L) 116 # init checks that detect high level all.equal 117 if (nrow(current) != nrow(target)) msg = "Different number of rows" 118 if (ncol(current) != ncol(target)) msg = c(msg, "Different number of columns") 119 diff.colnames = !identical(sort(names(target)), sort(names(current))) 120 diff.colorder = !identical(names(target), names(current)) 121 if (check.attributes && diff.colnames) msg = c(msg, "Different column names") 122 if (!diff.colnames && !ignore.col.order && diff.colorder) msg = c(msg, "Different column order") 123 124 if (length(msg)) return(msg) # skip check.attributes and further heavy processing 125 126 # ignore.col.order 127 if (ignore.col.order && diff.colorder) current = setcolorder(shallow(current), names(target)) 128 129 # Always check modes equal, like base::all.equal 130 targetModes = vapply_1c(target, mode) 131 currentModes = vapply_1c(current, mode) 132 if (any( d<-(targetModes!=currentModes) )) { 133 w = head(which(d),3L) 134 return(paste0("Datasets have different column modes. First 3: ",paste( 135 paste0(names(targetModes)[w],"(",paste(targetModes[w],currentModes[w],sep="!="),")") 136 ,collapse=" "))) 137 } 138 139 if (check.attributes) { 140 squashClass = function(x) if (is.object(x)) paste(class(x),collapse=";") else mode(x) 141 # else mode() is so that integer==numeric, like base all.equal does. 142 targetTypes = vapply_1c(target, squashClass) 143 currentTypes = vapply_1c(current, squashClass) 144 if (length(targetTypes) != length(currentTypes)) 145 stop("Internal error: ncol(current)==ncol(target) was checked above") # nocov 146 if (any( d<-(targetTypes != currentTypes))) { 147 w = head(which(d),3L) 148 return(paste0("Datasets have different column classes. First 3: ",paste( 149 paste0(names(targetTypes)[w],"(",paste(targetTypes[w],currentTypes[w],sep="!="),")") 150 ,collapse=" "))) 151 } 152 153 # check key 154 k1 = key(target) 155 k2 = key(current) 156 if (!identical(k1, k2)) { 157 return(sprintf("Datasets has different keys. 'target'%s. 'current'%s.", 158 if(length(k1)) paste0(": ", paste(k1, collapse=", ")) else " has no key", 159 if(length(k2)) paste0(": ", paste(k2, collapse=", ")) else " has no key")) 160 } 161 # check index 162 i1 = indices(target) 163 i2 = indices(current) 164 if (!identical(i1, i2)) { 165 return(sprintf("Datasets has different indexes. 'target'%s. 'current'%s.", 166 if(length(i1)) paste0(": ", paste(i1, collapse=", ")) else " has no index", 167 if(length(i2)) paste0(": ", paste(i2, collapse=", ")) else " has no index")) 168 } 169 170 # Trim any extra row.names attributes that came from some inheritance 171 # Trim ".internal.selfref" as long as there is no `all.equal.externalptr` method 172 exclude.attrs = function(x, attrs = c("row.names",".internal.selfref")) x[!names(x) %chin% attrs] 173 a1 = exclude.attrs(attributes(target)) 174 a2 = exclude.attrs(attributes(current)) 175 if (length(a1) != length(a2)) return(sprintf("Datasets has different number of (non-excluded) attributes: target %s, current %s", length(a1), length(a2))) 176 if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", paste(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2))), collapse=", "))) 177 attrs.r = all.equal(a1[nm1], a2[nm2], ..., check.attributes = check.attributes) 178 if (is.character(attrs.r)) return(paste("Attributes: <", attrs.r, ">")) # skip further heavy processing 179 } 180 181 if (ignore.row.order) { 182 if (".seqn" %chin% names(target)) 183 stop("None of the datasets to compare should contain a column named '.seqn'") 184 bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list")) 185 if (any(bad.type)) 186 stop("Datasets to compare with 'ignore.row.order' must not have unsupported column types: ", brackify(names(bad.type)[bad.type])) 187 if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) { 188 warning("Argument 'tolerance' was forced to lowest accepted value `sqrt(.Machine$double.eps)` from provided ", format(tolerance, scientific=FALSE)) 189 tolerance = sqrt(.Machine$double.eps) 190 } 191 target_dup = as.logical(anyDuplicated(target)) 192 current_dup = as.logical(anyDuplicated(current)) 193 tolerance.msg = if (identical(tolerance, 0)) ", be aware you are using `tolerance=0` which may result into visually equal data" else "" 194 if (target_dup || current_dup) { 195 # handling 'tolerance' for duplicate rows - those `msg` will be returned only when equality with tolerance will fail 196 if (any(vapply_1c(target,typeof)=="double") && !identical(tolerance, 0)) { 197 if (target_dup && !current_dup) msg = c(msg, "Dataset 'target' has duplicate rows while 'current' doesn't") 198 else if (!target_dup && current_dup) msg = c(msg, "Dataset 'current' has duplicate rows while 'target' doesn't") 199 else { # both 200 if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error 201 stop("Duplicate rows in datasets, numeric columns and ignore.row.order cannot be used with non 0 tolerance argument") 202 msg = c(msg, "Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0") 203 tolerance = 0 204 } 205 } else { # no numeric columns or tolerance==0L 206 if (target_dup && !current_dup) 207 return(sprintf("Dataset 'target' has duplicate rows while 'current' doesn't%s", tolerance.msg)) 208 if (!target_dup && current_dup) 209 return(sprintf("Dataset 'current' has duplicate rows while 'target' doesn't%s", tolerance.msg)) 210 } 211 } 212 # handling 'tolerance' for factor cols - those `msg` will be returned only when equality with tolerance will fail 213 if (any(vapply_1b(target,is.factor)) && !identical(tolerance, 0)) { 214 if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error 215 stop("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") 216 msg = c(msg, "Using factor columns together together with ignore.row.order, this force 'tolerance' argument to 0") 217 tolerance = 0 218 } 219 jn.on = copy(names(target)) # default, possible altered later on 220 dbl.cols = vapply_1c(target,typeof)=="double" 221 if (!identical(tolerance, 0)) { 222 if (!any(dbl.cols)) { # dbl.cols handles (removed) "all character columns" (char.cols) case as well 223 tolerance = 0 224 } else { 225 jn.on = jn.on[c(which(!dbl.cols), which(dbl.cols))] # double column must be last for rolling join 226 } 227 } 228 if (target_dup && current_dup) { 229 target = shallow(target)[, ".seqn" := rowidv(target)] 230 current = shallow(current)[, ".seqn" := rowidv(current)] 231 jn.on = c(".seqn", jn.on) 232 } 233 # roll join to support 'tolerance' argument, conditional to retain support for factor when tolerance=0 234 ans = if (identical(tolerance, 0)) target[current, nomatch=NA, which=TRUE, on=jn.on] else { 235 ans1 = target[current, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on] 236 ans2 = target[current, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on] 237 pmin(ans1, ans2, na.rm=TRUE) 238 } 239 if (any_na(as_list(ans))) { 240 msg = c(msg, sprintf("Dataset 'current' has rows not present in 'target'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg)) 241 return(msg) 242 } 243 # rolling join other way around 244 ans = if (identical(tolerance, 0)) current[target, nomatch=NA, which=TRUE, on=jn.on] else { 245 ans1 = current[target, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on] 246 ans2 = current[target, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on] 247 pmin(ans1, ans2, na.rm=TRUE) 248 } 249 if (any_na(as_list(ans))) { 250 msg = c(msg, sprintf("Dataset 'target' has rows not present in 'current'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg)) 251 return(msg) 252 } 253 } else { 254 for (i in seq_along(target)) { 255 # trim.levels moved here 256 x = target[[i]] 257 y = current[[i]] 258 if (xor(is.factor(x),is.factor(y))) 259 stop("Internal error: factor type mismatch should have been caught earlier") # nocov 260 cols.r = TRUE 261 if (is.factor(x)) { 262 if (!identical(levels(x),levels(y))) { 263 if (trim.levels) { 264 # do this regardless of check.attributes (that's more about classes, checked above) 265 x = factor(x) 266 y = factor(y) 267 if (!identical(levels(x),levels(y))) 268 cols.r = "Levels not identical even after refactoring since trim.levels is TRUE" 269 } else { 270 cols.r = "Levels not identical. No attempt to refactor because trim.levels is FALSE" 271 } 272 } else { 273 cols.r = all.equal(x, y, check.attributes=check.attributes) 274 # the check.attributes here refers to everything other than the levels, which are always 275 # dealt with according to trim.levels 276 } 277 } else { 278 cols.r = all.equal(unclass(x), unclass(y), tolerance=tolerance, ..., check.attributes=check.attributes) 279 # classes were explicitly checked earlier above, so ignore classes here. 280 } 281 if (!isTRUE(cols.r)) return(paste0("Column '", names(target)[i], "': ", paste(cols.r,collapse=" "))) 282 } 283 } 284 TRUE 285} 286 287