# fread(): read delimited text into a data.table (or data.frame when data.table=FALSE).
#
# Overview of what this function does, in order:
#   1. validates the scalar arguments (sep, dec, encoding, flags, nrows, header, nThread);
#   2. resolves the data source: exactly one of input=, file=, text= or cmd= may be used.
#      - text= of length > 1 is written to a temp file; a single string is passed through as-is
#      - input= is classified as literal data (contains \n or \r, or is ""), a URL (downloaded
#        to a temp file), a system command (contains a space and is not an existing file), or
#        otherwise a file name
#      - cmd= output is redirected to a temp file
#      - files ending .gz / .bz2 are decompressed to a temp file via R.utils
#      All temp files are created in tmpdir= and removed on exit.
#   3. normalises colClasses, skip and na.strings;
#   4. when yaml=TRUE, parses the YAML header at the top of the file and uses it to fill in
#      header, colClasses/col.names, sep, quote, dec and na.strings unless the caller supplied
#      those arguments explicitly;
#   5. calls the C-level reader (CfreadR);
#   6. post-processes the result: class, check.names, colClasses coercions that were deferred
#      to R level, stringsAsFactors, col.names, key and index.
#
# Returns the table read; an empty data.table/data.frame for zero-length text= or a 0-size file.
# Errors (via stop/stopifnot) on invalid arguments or an unreadable/nonexistent file.
fread = function(
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE),
yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
{
  # at most one of the four source arguments may be supplied
  if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.")
  input_has_vars = length(all.vars(substitute(input)))>0L  # see news for v1.11.6
  if (is.null(sep)) sep="\n"                         # C level knows that \n means \r\n on Windows, for example
  else {
    stopifnot( length(sep)==1L, !is.na(sep), is.character(sep) )
    if (sep=="") { sep="\n" }                        # meaning readLines behaviour. The 3 values (NULL, "" or "\n") are equivalent.
    else if (sep=="auto") sep=""                     # sep=="" at C level means auto sep
    else stopifnot( nchar(sep)==1L )                 # otherwise an actual character to use as sep
  }
  stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L )
  # handle encoding, #563
  if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) {
    stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
  }
  stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress),
             isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml) )
  # stringsAsFactors is either a flag or a single proportion in [0,1] (see its use near the end)
  stopifnot( isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0))
  stopifnot( is.numeric(nrows), length(nrows)==1L )
  if (is.na(nrows) || nrows<0L) nrows=Inf   # accept -1 to mean Inf, as read.table does
  if (identical(header,"auto")) header=NA
  stopifnot(is.logical(header) && length(header)==1L)  # TRUE, FALSE or NA
  stopifnot(is.numeric(nThread) && length(nThread)==1L)
  nThread=as.integer(nThread)
  stopifnot(nThread>=1L)

  # ---- resolve the data source into either `input` (literal data) or `file` ----
  if (!is.null(text)) {
    if (!is.character(text)) stop("'text=' is type ", typeof(text), " but must be character.")
    if (!length(text)) return(data.table())
    if (length(text) > 1L) {
      cat(text, file=(tmpFile<-tempfile(tmpdir=tmpdir)), sep="\n")  # avoid paste0() which could create a new very long single string in R's memory
      file = tmpFile
      on.exit(unlink(tmpFile), add=TRUE)
    } else {
      # avoid creating a tempfile() for single strings, which can be done a lot; e.g. in the test suite.
      input = text
    }
  }
  else if (is.null(cmd)) {
    if (!is.character(input) || length(input)!=1L) {
      stop("input= must be a single character string containing a file name, a system command containing at least one space, a URL starting 'http[s]://', 'ftp[s]://' or 'file://', or, the input data itself containing at least one \\n or \\r")
    }
    if (input=="" || length(grep('\\n|\\r', input))) {
      # input is data itself containing at least one \n or \r
    } else {
      if (substring(input,1L,1L)==" ") {
        stop("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=")
      }
      str6 = substring(input,1L,6L)   # avoid grepl() for #2531
      str7 = substring(input,1L,7L)
      str8 = substring(input,1L,8L)
      if (str7=="ftps://" || str8=="https://") {
        # nocov start
        if (!requireNamespace("curl", quietly = TRUE))
          stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov
        tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir)  # retain .gz extension in temp filename so it knows to be decompressed further below
        curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress)
        file = tmpFile
        on.exit(unlink(tmpFile), add=TRUE)
        # nocov end
      }
      else if (str6=="ftp://" || str7== "http://" || str7=="file://") {
        # nocov start
        method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto")
        # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668
        tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir)
        download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress)
        # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF"
        file = tmpFile
        on.exit(unlink(tmpFile), add=TRUE)
        # nocov end
      }
      else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) {  # file name or path containing spaces is not a command
        cmd = input
        if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) {
          message("Taking input= as a system command ('",cmd,"') and a variable has been used in the expression passed to `input=`. Please use fread(cmd=...). There is a security concern if you are creating an app, and the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.")
        }
      }
      else {
        file = input   # filename
      }
    }
  }
  if (!is.null(cmd)) {
    # run the command through the platform shell, redirecting its stdout to a temp file which is then read as a file
    (if (.Platform$OS.type == "unix") system else shell)(paste0('(', cmd, ') > ', tmpFile<-tempfile(tmpdir=tmpdir)))
    file = tmpFile
    on.exit(unlink(tmpFile), add=TRUE)
  }
  if (!is.null(file)) {
    file_info = file.info(file)
    if (is.na(file_info$size)) stop("File '",file,"' does not exist or is non-readable. getwd()=='", getwd(), "'")
    if (isTRUE(file_info$isdir)) stop("File '",file,"' is a directory. Not yet implemented.") # dir.exists() requires R v3.2+, #989
    if (!file_info$size) {
      warning("File '", file, "' has size 0. Returning a NULL ",
              if (data.table) 'data.table' else 'data.frame', ".")
      return(if (data.table) data.table(NULL) else data.frame(NULL))
    }
    ext2 = substring(file, nchar(file)-2L, nchar(file))   # last 3 characters ".gz"
    ext3 = substring(file, nchar(file)-3L, nchar(file))   # last 4 characters ".bz2"
    if (ext2==".gz" || ext3==".bz2") {
      if (!requireNamespace("R.utils", quietly = TRUE))
        stop("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov
      FUN = if (ext2==".gz") gzfile else bzfile
      R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE)   # ext is not used by decompressFile when destname is supplied, but isn't optional
      file = decompFile   # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
      on.exit(unlink(decompFile), add=TRUE)
    }
    file = enc2native(file) # CfreadR cannot handle UTF-8 if that is not the native encoding, see #3078.

    input = file
  }

  # ---- normalise remaining arguments ----
  if (!missing(autostart)) warning("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n")
  if (is.logical(colClasses)) {
    if (!allNA(colClasses)) stop("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.")
    colClasses = NULL
  }
  if (!is.null(colClasses) && is.atomic(colClasses)) {
    if (!is.character(colClasses)) stop("colClasses is not type list or character vector")
    if (!length(colClasses)) {
      colClasses=NULL
    } else if (identical(colClasses, "NULL")) {
      colClasses = NULL
      warning('colClasses="NULL" (quoted) is interpreted as colClasses=NULL (the default) as opposed to dropping every column.')
    } else if (!is.null(names(colClasses))) {   # names are column names; convert to list approach
      colClasses = tapply(names(colClasses), colClasses, c, simplify=FALSE)
    }
  }
  stopifnot(length(skip)==1L, !is.na(skip), is.character(skip) || is.numeric(skip))
  if (identical(skip,"__auto__")) skip = if (yaml) 0L else -1L
  else if (is.double(skip)) skip = as.integer(skip)
  # else skip="string" so long as "string" is not "__auto__" (best conveys to user skip is automatic rather than user needing to know -1 or NA means auto)
  stopifnot(is.null(na.strings) || is.character(na.strings))
  tt = grep("^\\s+$", na.strings)
  if (length(tt)) {
    msg = paste0('na.strings[', tt[1L], ']=="',na.strings[tt[1L]],'" consists only of whitespace, ignoring. ')
    if (strip.white) {
      if (any(na.strings=="")) {
        warning(msg, 'strip.white==TRUE (default) and "" is present in na.strings, so any number of spaces in string columns will already be read as <NA>.')
      } else {
        warning(msg, 'Since strip.white=TRUE (default), use na.strings="" to specify that any number of spaces in a string column should be read as <NA>.')
      }
      na.strings = na.strings[-tt]
    } else {
      stop(msg, 'But strip.white=FALSE. Use strip.white=TRUE (default) together with na.strings="" to turn any number of spaces in string columns into <NA>')
    }
    # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
  }

  # ---- optional YAML (csvy) header ----
  if (yaml) {
    if (!requireNamespace('yaml', quietly = TRUE))
      stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
    # for tracking which YAML elements may be overridden by being declared explicitly
    call_args = names(match.call())
    if (is.character(skip))
      warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ",
              "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ",
              "the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.")
    # create connection to stream header lines from file:
    #   https://stackoverflow.com/questions/9871307
    f = base::file(input, 'r')
    first_line = readLines(f, n=1L)
    n_read = 1L
    yaml_border_re = '^#?---'
    if (!grepl(yaml_border_re, first_line)) {
      close(f)
      # skip may legitimately be a search string here (only warned about above); 1L+skip would
      # then fail with "non-numeric argument to binary operator" and mask this diagnostic.
      first_line_no = if (is.numeric(skip)) 1L+skip else 1L
      stop('Encountered <', substring(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...', '> at the first ',
           'unskipped line (', first_line_no, '), which does not constitute the start to a valid YAML header ',
           '(expecting something matching regex "', yaml_border_re, '"); please check your input and try again.')
    }

    yaml_comment_re = '^#'
    yaml_string = character(0L)
    # accumulate header lines (with any leading '#' removed) until the closing border line
    while (TRUE) {
      this_line = readLines(f, n=1L)
      n_read = n_read + 1L
      if (!length(this_line)){
        close(f)
        stop('Reached the end of the file before finding a completion to the YAML header. A valid YAML header is bookended by lines matching ',
             'the regex "', yaml_border_re, '". Please double check the input file is a valid csvy.')
      }
      if (grepl(yaml_border_re, this_line)) break
      if (grepl(yaml_comment_re, this_line))
        this_line = sub(yaml_comment_re, '', this_line)
      yaml_string = paste(yaml_string, this_line, sep='\n')
    }
    close(f) # when #561 is implemented, no need to close f.

    yaml_header = yaml::yaml.load(yaml_string)
    yaml_names = names(yaml_header)
    if (verbose) cat('Processed', n_read, 'lines of YAML metadata with the following top-level fields:', brackify(yaml_names), '\n')
    # process header first since it impacts how to handle colClasses
    if ('header' %chin% yaml_names) {
      if ('header' %chin% call_args) message("User-supplied 'header' will override that found in metadata.")
      else header = as.logical(yaml_header$header)
    }
    if ('schema' %chin% yaml_names) {
      new_types = sapply(yaml_header$schema$fields, `[[`, 'type')
      if (any(null_idx <- sapply(new_types, is.null)))
        new_types = do.call(c, new_types)
      # lookup table mapping the type names accepted in the schema to R type names
      synonms = rbindlist(list(
        character = list(syn = c('character', 'string')),
        integer = list(syn = c('integer', 'int')),
        numeric = list(syn = c('numeric', 'number', 'double')),
        factor = list(syn = c('factor', 'categorical')),
        integer64 = list(syn = c('integer64', 'int64'))
      ), idcol = 'r_type')
      setkeyv(synonms, 'syn')
      new_types = synonms[list(new_types)]$r_type
      new_names = sapply(yaml_header$schema$fields[!null_idx], `[[`, 'name')

      if ('col.names' %chin% call_args) message("User-supplied column names in 'col.names' will override those found in YAML metadata.")
      # resolve any conflicts with colClasses, if supplied;
      #   colClasses (if present) is already in list form by now
      if ('colClasses' %chin% call_args) {
        if (any(idx_name <- new_names %chin% unlist(colClasses))) {
          matched_name_idx = which(idx_name)
          # idx_type[k] is TRUE when the user's colClasses agrees with the YAML type for that column
          if (!all(idx_type <- sapply(matched_name_idx, function(ii) {
            new_names[ii] %chin% colClasses[[ new_types[ii] ]]
          }))) {
            # pluralise on the number of conflicting columns (the ones listed in the message)
            plural = sum(!idx_type) > 1L
            message('colClasses dictated by user input and those read from YAML header are in conflict (specifically, for column', if (plural) 's',
                    ' [', paste(new_names[matched_name_idx[!idx_type]], collapse = ','), ']); the proceeding assumes the user input was ',
                    'an intentional override and will ignore the types implied by the YAML header; please exclude ',
                    if (plural) 'these columns' else 'this column', ' from colClasses if this was unintentional.')
          }
        }
        # only add unmentioned columns
        for (ii in which(!idx_name)) {
          colClasses[[ new_types[ii] ]] = c(colClasses[[ new_types[ii] ]], new_names[ii])
        }
      } else {
        # there are no names to be matched in the data, which fread expects
        #   at the C level; instead, apply these in post through col.names
        #   and send the auto-generated V1:Vn as dummies
        if (identical(header, FALSE)) {
          if (!'col.names' %chin% call_args) col.names = new_names
          new_names = paste0('V', seq_along(new_names))
        }
        colClasses = tapply(new_names, new_types, c, simplify=FALSE)
      }
    }
    sep_syn = c('sep', 'delimiter')
    if (any(sep_idx <- sep_syn %chin% yaml_names)) {
      if ('sep' %chin% call_args) message("User-supplied 'sep' will override that found in metadata.")
      else sep = yaml_header[[ sep_syn[sep_idx][1L] ]]
    }
    quote_syn = c('quote', 'quoteChar', 'quote_char')
    if (any(quote_idx <- quote_syn %chin% yaml_names)) {
      if ('quote' %chin% call_args) message("User-supplied 'quote' will override that found in metadata.")
      else quote = yaml_header[[ quote_syn[quote_idx][1L] ]]
    }
    dec_syn = c('dec', 'decimal')
    if (any(dec_idx <- dec_syn %chin% yaml_names)) {
      if ('dec' %chin% call_args) message("User-supplied 'dec' will override that found in metadata.")
      else dec = yaml_header[[ dec_syn[dec_idx][1L] ]]
    }
    if ('na.strings' %chin% yaml_names) {
      if ('na.strings' %chin% call_args) message("User-supplied 'na.strings' will override that found in metadata.")
      else na.strings = yaml_header$na.strings
    }
    # the header lines consumed above are added to a numeric skip
    if (is.integer(skip)) skip = skip + n_read
  }

  # ---- read at C level ----
  warnings2errors = getOption("warn") >= 2
  stopifnot(identical(tz,"UTC") || identical(tz,""))
  if (tz=="") {
    tt = Sys.getenv("TZ", unset=NA_character_)
    if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
      tz="UTC"
  }
  ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
              fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
  if (!length(ans)) return(null.data.table())  # test 1743.308 drops all columns
  nr = length(ans[[1L]])
  require_bit64_if_needed(ans)
  setattr(ans,"row.names",.set_row_names(nr))

  # ---- post-process the result ----
  if (isTRUE(data.table)) {
    setattr(ans, "class", c("data.table", "data.frame"))
    setalloccol(ans)
  } else {
    setattr(ans, "class", "data.frame")
  }
  # #1027, make.unique -> make.names as spotted by @DavidArenberg
  if (check.names) {
    setattr(ans, 'names', make.names(names(ans), unique=TRUE))
  }

  colClassesAs = attr(ans, "colClassesAs", exact=TRUE)   # should only be present if one or more are != ""
  for (j in which(colClassesAs!="")) {       # # 1634
    v = .subset2(ans, j)
    new_class = colClassesAs[j]
    new_v = tryCatch({    # different to read.csv; i.e. won't error if a column won't coerce (fallback with warning instead)
      switch(new_class,
             "factor" = as_factor(v),
             "complex" = as.complex(v),
             "raw" = as_raw(v),  # Internal implementation
             "Date" = as.Date(v),
             "POSIXct" = as.POSIXct(v),  # test 2150.14 covers this by setting the option to restore old behaviour. Otherwise types that
             # are recognized by freadR.c (e.g. POSIXct; #4464) result in user-override-bump at C level before reading so do not reach this switch
             # see https://github.com/Rdatatable/data.table/pull/4464#discussion_r447275278.
             # Aside: as(v,"POSIXct") fails with error in R so has to be caught explicitly above
             # finally:
             methods::as(v, new_class))
    },
    # the same handler serves both warnings and errors: report, then keep the column as read
    warning = fun <- function(e) {
      warning("Column '", names(ans)[j], "' was requested to be '", new_class, "' but fread encountered the following ",
              if (inherits(e, "error")) "error" else "warning", ":\n\t", e$message, "\nso the column has been left as type '", typeof(v), "'", call.=FALSE)
      return(v)
    },
    error = fun)
    set(ans, j = j, value = new_v)  # aside: new_v == v if the coercion was aborted
  }
  setattr(ans, "colClassesAs", NULL)

  if (stringsAsFactors) {
    if (is.double(stringsAsFactors)) { #2025
      # a proportion: only convert character columns whose unique-value count is below nr * stringsAsFactors
      should_be_factor = function(v) is.character(v) && uniqueN(v) < nr * stringsAsFactors
      cols_to_factor = which(vapply_1b(ans, should_be_factor))
    } else {
      cols_to_factor = which(vapply_1b(ans, is.character))
    }
    if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="")
    for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j)))
  }

  if (!missing(col.names))   # FR #768
    setnames(ans, col.names) # setnames checks and errors automatically
  if (!is.null(key) && data.table) {
    if (!is.character(key))
      stop("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
    if (length(key) == 1L) {
      # a single string may hold several comma-separated column names
      key = strsplit(key, split = ",", fixed = TRUE)[[1L]]
    }
    setkeyv(ans, key)
  }
  if (yaml) setattr(ans, 'yaml_metadata', yaml_header)
  if (!is.null(index) && data.table) {
    if (!all(sapply(index, is.character)))
      stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
    if (is.list(index)) {
      to_split = sapply(index, length) == 1L
      if (any(to_split))
        index[to_split] = sapply(index[to_split], strsplit, split = ",", fixed = TRUE)
    } else {
      if (length(index) == 1L) {
        # setindexv accepts lists, so no [[1]]
        index = strsplit(index, split = ",", fixed = TRUE)
      }
    }
    setindexv(ans, index)
  }
  ans
}

# simplified but faster version of `factor()` for internal use.
# Levels are taken from the first element of each group found by forderv() (NAs dropped via
# na.last=NA); the integer codes come from chmatch() of x against those levels.
as_factor = function(x) {
  lev = forderv(x, retGrp = TRUE, na.last = NA)
  # get levels, also take care of all sorted condition
  lev = if (length(lev)) x[lev[attributes(lev)$starts]] else x[attributes(lev)$starts]
  ans = chmatch(x, lev)
  setattr(ans, 'levels', lev)
  setattr(ans, 'class', 'factor')
}

# Convert a character vector to raw by letting scan() parse it with what=raw().
as_raw = function(x) {
  scan(text=x, what=raw(), quiet=TRUE)  # as in read.csv, which ultimately uses src/main/scan.c and strtoraw
}