# fread: read delimited text into a data.table (or data.frame).
#
# Exactly one data source may be supplied:
#   input  a single string: the data itself (contains \n or \r), a URL (http[s]://, ftp[s]://, file://),
#          a system command (contains a space and is not an existing file), or otherwise a file name
#   file   a file name; files ending .gz or .bz2 are decompressed to a temp file first (needs R.utils)
#   text   a character vector of lines (written to a temp file when length > 1)
#   cmd    a system command whose stdout is redirected to a temp file which is then read
#
# Arguments validated and passed on to the C reader (CfreadR): sep, dec, quote, header, nrows, skip, na.strings,
# strip.white, blank.lines.skip, fill, showProgress, nThread, verbose, logical01, select, drop, colClasses,
# integer64, encoding, keepLeadingZeros and tz (which must be "UTC" or "").
#
# Arguments applied in R after the C reader returns, in this order: check.names (make.names), colClasses that the
# C level left as strings (coerced column by column, falling back with a warning), stringsAsFactors (TRUE/FALSE or
# a proportion in [0,1]), col.names (setnames), key (setkeyv) and index (setindexv). key and index are only
# applied when data.table=TRUE.
#
# yaml=TRUE parses a YAML header (lines bookended by '---' or '#---') using the 'yaml' package; header, schema,
# sep, quote, dec and na.strings found there are used unless the caller passed the corresponding argument.
# The parsed header is attached to the result as attribute 'yaml_metadata'.
#
# sep2 is accepted but not used anywhere in this function body. autostart is deprecated and ignored (warning).
# tmpdir is where every temp file created here is placed; each one is removed on exit.
#
# Returns the data.table (data.table=TRUE) or data.frame that was read; a zero-size file returns a NULL
# data.table/data.frame with a warning.
fread = function(
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE),
yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
{
  # ---- argument validation ----
  # each term is 1 when that source is NOT supplied; fewer than 3 unsupplied means 2+ sources were given
  if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.")
  input_has_vars = length(all.vars(substitute(input)))>0L  # see news for v1.11.6
  if (is.null(sep)) sep="\n"         # C level knows that \n means \r\n on Windows, for example
  else {
    stopifnot( length(sep)==1L, !is.na(sep), is.character(sep) )
    if (sep=="") { sep="\n" }         # meaning readLines behaviour. The 3 values (NULL, "" or "\n") are equivalent.
    else if (sep=="auto") sep=""      # sep=="" at C level means auto sep
    else stopifnot( nchar(sep)==1L )  # otherwise an actual character to use as sep
  }
  stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L )
  # handle encoding, #563
  if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) {
    stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
  }
  stopifnot( isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress),
             isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml) )
  stopifnot( isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0))
  stopifnot( is.numeric(nrows), length(nrows)==1L )
  if (is.na(nrows) || nrows<0L) nrows=Inf   # accept -1 to mean Inf, as read.table does
  if (identical(header,"auto")) header=NA
  stopifnot(is.logical(header) && length(header)==1L)  # TRUE, FALSE or NA
  stopifnot(is.numeric(nThread) && length(nThread)==1L)
  nThread=as.integer(nThread)
  stopifnot(nThread>=1L)

  # ---- resolve the data source to either `input` (data itself) or `file` (path on disk) ----
  if (!is.null(text)) {
    if (!is.character(text)) stop("'text=' is type ", typeof(text), " but must be character.")
    if (!length(text)) return(data.table())
    if (length(text) > 1L) {
      tmpFile = tempfile(tmpdir=tmpdir)
      on.exit(unlink(tmpFile), add=TRUE)    # registered before writing so a failed write does not leave the file behind
      cat(text, file=tmpFile, sep="\n")     # avoid paste0() which could create a new very long single string in R's memory
      file = tmpFile
    } else {
      # avoid creating a tempfile() for single strings, which can be done a lot; e.g. in the test suite.
      input = text
    }
  }
  else if (is.null(cmd)) {
    if (!is.character(input) || length(input)!=1L) {
      stop("input= must be a single character string containing a file name, a system command containing at least one space, a URL starting 'http[s]://', 'ftp[s]://' or 'file://', or, the input data itself containing at least one \\n or \\r")
    }
    if (input=="" || length(grep('\\n|\\r', input))) {
      # input is data itself containing at least one \n or \r
    } else {
      if (substring(input,1L,1L)==" ") {
        stop("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=")
      }
      str6 = substring(input,1L,6L)   # avoid grepl() for #2531
      str7 = substring(input,1L,7L)
      str8 = substring(input,1L,8L)
      if (str7=="ftps://" || str8=="https://") {
        # nocov start
        if (!requireNamespace("curl", quietly = TRUE))
          stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov
        tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir)  # retain .gz extension in temp filename so it knows to be decompressed further below
        on.exit(unlink(tmpFile), add=TRUE)   # before the download so a partial download is cleaned up too
        curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress)
        file = tmpFile
        # nocov end
      }
      else if (str6=="ftp://" || str7== "http://" || str7=="file://") {
        # nocov start
        # force "internal" when file:// to ensure we don't use an invalid option (e.g. wget), #1668
        method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto")
        tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir)
        on.exit(unlink(tmpFile), add=TRUE)   # before the download so a partial download is cleaned up too
        download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress)
        # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF"
        file = tmpFile
        # nocov end
      }
      else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) {  # file name or path containing spaces is not a command
        cmd = input
        if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) {
          message("Taking input= as a system command ('",cmd,"') and a variable has been used in the expression passed to `input=`. Please use fread(cmd=...). There is a security concern if you are creating an app, and the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.")
        }
      }
      else {
        file = input   # filename
      }
    }
  }
  if (!is.null(cmd)) {
    tmpFile = tempfile(tmpdir=tmpdir)
    on.exit(unlink(tmpFile), add=TRUE)
    # quote the redirect target: tmpdir may contain spaces or other characters special to the shell.
    # NOTE: cmd itself is passed to the shell verbatim by design; callers must not build it from untrusted input.
    (if (.Platform$OS.type == "unix") system else shell)(paste0('(', cmd, ') > ', shQuote(tmpFile)))
    file = tmpFile
  }
  if (!is.null(file)) {
    file_info = file.info(file)
    if (is.na(file_info$size)) stop("File '",file,"' does not exist or is non-readable. getwd()=='", getwd(), "'")
    if (isTRUE(file_info$isdir)) stop("File '",file,"' is a directory. Not yet implemented.") # dir.exists() requires R v3.2+, #989
    if (!file_info$size) {
      warning("File '", file, "' has size 0. Returning a NULL ",
              if (data.table) 'data.table' else 'data.frame', ".")
      return(if (data.table) data.table(NULL) else data.frame(NULL))
    }
    ext2 = substring(file, nchar(file)-2L, nchar(file))   # last 3 characters ".gz"
    ext3 = substring(file, nchar(file)-3L, nchar(file))   # last 4 characters ".bz2"
    if (ext2==".gz" || ext3==".bz2") {
      if (!requireNamespace("R.utils", quietly = TRUE))
        stop("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov
      FUN = if (ext2==".gz") gzfile else bzfile
      decompFile = tempfile(tmpdir=tmpdir)   # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
      on.exit(unlink(decompFile), add=TRUE)  # before decompressing so a partially written file is cleaned up too
      R.utils::decompressFile(file, decompFile, ext=NULL, FUN=FUN, remove=FALSE)   # ext is not used by decompressFile when destname is supplied, but isn't optional
      file = decompFile
    }
    file = enc2native(file) # CfreadR cannot handle UTF-8 if that is not the native encoding, see #3078.

    input = file
  }

  # ---- normalise colClasses, skip and na.strings ----
  if (!missing(autostart)) warning("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n")
  if (is.logical(colClasses)) {
    if (!allNA(colClasses)) stop("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.")
    colClasses = NULL
  }
  if (!is.null(colClasses) && is.atomic(colClasses)) {
    if (!is.character(colClasses)) stop("colClasses is not type list or character vector")
    if (!length(colClasses)) {
      colClasses=NULL
    } else if (identical(colClasses, "NULL")) {
      colClasses = NULL
      warning('colClasses="NULL" (quoted) is interpreted as colClasses=NULL (the default) as opposed to dropping every column.')
    } else if (!is.null(names(colClasses))) {   # names are column names; convert to list approach
      colClasses = tapply(names(colClasses), colClasses, c, simplify=FALSE)
    }
  }
  stopifnot(length(skip)==1L, !is.na(skip), is.character(skip) || is.numeric(skip))
  if (identical(skip,"__auto__")) skip = if (yaml) 0L else -1L
  else if (is.double(skip)) skip = as.integer(skip)
  # else skip="string" so long as "string" is not "__auto__" (best conveys to user skip is automatic rather than user needing to know -1 or NA means auto)
  stopifnot(is.null(na.strings) || is.character(na.strings))
  tt = grep("^\\s+$", na.strings)
  if (length(tt)) {
    msg = paste0('na.strings[', tt[1L], ']=="',na.strings[tt[1L]],'" consists only of whitespace, ignoring. ')
    if (strip.white) {
      if (any(na.strings=="")) {
        warning(msg, 'strip.white==TRUE (default) and "" is present in na.strings, so any number of spaces in string columns will already be read as <NA>.')
      } else {
        warning(msg, 'Since strip.white=TRUE (default), use na.strings="" to specify that any number of spaces in a string column should be read as <NA>.')
      }
      na.strings = na.strings[-tt]
    } else {
      stop(msg, 'But strip.white=FALSE. Use strip.white=TRUE (default) together with na.strings="" to turn any number of spaces in string columns into <NA>')
    }
    # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
  }

  # ---- optional YAML (csvy) header ----
  if (yaml) {
    if (!requireNamespace('yaml', quietly = TRUE))
      stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
    # for tracking which YAML elements may be overridden by being declared explicitly
    call_args = names(match.call())
    if (is.character(skip))
      warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ",
              "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ",
              "the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.")
    # create connection to stream header lines from file:
    #   https://stackoverflow.com/questions/9871307
    f = base::file(input, 'r')
    first_line = readLines(f, n=1L)
    n_read = 1L
    yaml_border_re = '^#?---'
    if (!grepl(yaml_border_re, first_line)) {
      close(f)
      # skip may be a search string here, for which 1L+skip would itself raise an unrelated error
      first_line_no = if (is.character(skip)) 1L else 1L+skip
      stop('Encountered <', substring(first_line, 1L, 50L), if (nchar(first_line) > 50L) '...', '> at the first ',
           'unskipped line (', first_line_no, '), which does not constitute the start to a valid YAML header ',
           '(expecting something matching regex "', yaml_border_re, '"); please check your input and try again.')
    }

    yaml_comment_re = '^#'
    yaml_string = character(0L)
    while (TRUE) {
      this_line = readLines(f, n=1L)
      n_read = n_read + 1L
      if (!length(this_line)){
        close(f)
        stop('Reached the end of the file before finding a completion to the YAML header. A valid YAML header is bookended by lines matching ',
             'the regex "', yaml_border_re, '". Please double check the input file is a valid csvy.')
      }
      if (grepl(yaml_border_re, this_line)) break
      if (grepl(yaml_comment_re, this_line))
        this_line = sub(yaml_comment_re, '', this_line)
      yaml_string = paste(yaml_string, this_line, sep='\n')
    }
    close(f) # when #561 is implemented, no need to close f.

    # NOTE(review): yaml.load runs on the file's own header; confirm it is acceptable for untrusted files
    yaml_header = yaml::yaml.load(yaml_string)
    yaml_names = names(yaml_header)
    if (verbose) cat('Processed', n_read, 'lines of YAML metadata with the following top-level fields:', brackify(yaml_names), '\n')
    # process header first since it impacts how to handle colClasses
    if ('header' %chin% yaml_names) {
      if ('header' %chin% call_args) message("User-supplied 'header' will override that found in metadata.")
      else header = as.logical(yaml_header$header)
    }
    if ('schema' %chin% yaml_names) {
      new_types = sapply(yaml_header$schema$fields, `[[`, 'type')
      if (any(null_idx <- sapply(new_types, is.null)))
        new_types = do.call(c, new_types)
      # lookup table mapping each accepted type spelling (syn) to the R type name (r_type)
      synonms = rbindlist(list(
        character = list(syn = c('character', 'string')),
        integer = list(syn = c('integer', 'int')),
        numeric = list(syn = c('numeric', 'number', 'double')),
        factor = list(syn = c('factor', 'categorical')),
        integer64 = list(syn = c('integer64', 'int64'))
      ), idcol = 'r_type')
      setkeyv(synonms, 'syn')
      new_types = synonms[list(new_types)]$r_type
      new_names = sapply(yaml_header$schema$fields[!null_idx], `[[`, 'name')

      if ('col.names' %chin% call_args) message("User-supplied column names in 'col.names' will override those found in YAML metadata.")
      # resolve any conflicts with colClasses, if supplied;
      #   colClasses (if present) is already in list form by now
      if ('colClasses' %chin% call_args) {
        if (any(idx_name <- new_names %chin% unlist(colClasses))) {
          matched_name_idx = which(idx_name)
          if (!all(idx_type <- sapply(matched_name_idx, function(ii) {
            new_names[ii] %chin% colClasses[[ new_types[ii] ]]
          }))) {
            # the columns named in the message are the conflicting ones, i.e. those where idx_type is FALSE
            plural = sum(!idx_type) > 1L
            message('colClasses dictated by user input and those read from YAML header are in conflict (specifically, for column', if (plural) 's',
                    ' [', paste(new_names[matched_name_idx[!idx_type]], collapse = ','), ']); the proceeding assumes the user input was ',
                    'an intentional override and will ignore the types implied by the YAML header; please exclude ',
                    if (plural) 'these columns' else 'this column', ' from colClasses if this was unintentional.')
          }
        }
        # only add unmentioned columns
        for (ii in which(!idx_name)) {
          colClasses[[ new_types[ii] ]] = c(colClasses[[ new_types[ii] ]], new_names[ii])
        }
      } else {
        # there are no names to be matched in the data, which fread expects
        #   at the C level; instead, apply these in post through col.names
        #   and send the auto-generated V1:Vn as dummies
        if (identical(header, FALSE)) {
          if (!'col.names' %chin% call_args) col.names = new_names
          new_names = paste0('V', seq_along(new_names))
        }
        colClasses = tapply(new_names, new_types, c, simplify=FALSE)
      }
    }
    sep_syn = c('sep', 'delimiter')
    if (any(sep_idx <- sep_syn %chin% yaml_names)) {
      if ('sep' %chin% call_args) message("User-supplied 'sep' will override that found in metadata.")
      else sep = yaml_header[[ sep_syn[sep_idx][1L] ]]
    }
    quote_syn = c('quote', 'quoteChar', 'quote_char')
    if (any(quote_idx <- quote_syn %chin% yaml_names)) {
      if ('quote' %chin% call_args) message("User-supplied 'quote' will override that found in metadata.")
      else quote = yaml_header[[ quote_syn[quote_idx][1L] ]]
    }
    dec_syn = c('dec', 'decimal')
    if (any(dec_idx <- dec_syn %chin% yaml_names)) {
      if ('dec' %chin% call_args) message("User-supplied 'dec' will override that found in metadata.")
      else dec = yaml_header[[ dec_syn[dec_idx][1L] ]]
    }
    if ('na.strings' %chin% yaml_names) {
      if ('na.strings' %chin% call_args) message("User-supplied 'na.strings' will override that found in metadata.")
      else na.strings = yaml_header$na.strings
    }
    # the C reader must additionally step over the n_read header lines consumed above
    if (is.integer(skip)) skip = skip + n_read
  }

  # ---- call the C reader ----
  warnings2errors = getOption("warn") >= 2
  stopifnot(identical(tz,"UTC") || identical(tz,""))
  if (tz=="") {
    tt = Sys.getenv("TZ", unset=NA_character_)
    if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
      tz="UTC"
  }
  ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
              fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
  if (!length(ans)) return(null.data.table())  # test 1743.308 drops all columns
  nr = length(ans[[1L]])
  require_bit64_if_needed(ans)
  setattr(ans,"row.names",.set_row_names(nr))

  # ---- post-processing in R ----
  if (isTRUE(data.table)) {
    setattr(ans, "class", c("data.table", "data.frame"))
    setalloccol(ans)
  } else {
    setattr(ans, "class", "data.frame")
  }
  # #1027, make.unique -> make.names as spotted by @DavidArenberg
  if (check.names) {
    setattr(ans, 'names', make.names(names(ans), unique=TRUE))
  }

  colClassesAs = attr(ans, "colClassesAs", exact=TRUE)   # should only be present if one or more are != ""
  for (j in which(colClassesAs!="")) {       # # 1634
    v = .subset2(ans, j)
    new_class = colClassesAs[j]
    new_v = tryCatch({    # different to read.csv; i.e. won't error if a column won't coerce (fallback with warning instead)
      switch(new_class,
             "factor" = as_factor(v),
             "complex" = as.complex(v),
             "raw" = as_raw(v),  # Internal implementation
             "Date" = as.Date(v),
             "POSIXct" = as.POSIXct(v),  # test 2150.14 covers this by setting the option to restore old behaviour. Otherwise types that
             # are recognized by freadR.c (e.g. POSIXct; #4464) result in user-override-bump at C level before reading so do not reach this switch
             # see https://github.com/Rdatatable/data.table/pull/4464#discussion_r447275278.
             # Aside: as(v,"POSIXct") fails with error in R so has to be caught explicitly above
             # finally:
             methods::as(v, new_class))
      },
      # one handler shared by warnings and errors: report the condition and keep the column as it was read
      warning = fun <- function(e) {
        warning("Column '", names(ans)[j], "' was requested to be '", new_class, "' but fread encountered the following ",
                if (inherits(e, "error")) "error" else "warning", ":\n\t", e$message, "\nso the column has been left as type '", typeof(v), "'", call.=FALSE)
        return(v)
      },
      error = fun)
    set(ans, j = j, value = new_v)  # aside: new_v == v if the coercion was aborted
  }
  setattr(ans, "colClassesAs", NULL)

  if (stringsAsFactors) {
    if (is.double(stringsAsFactors)) { #2025
      # a proportion: only convert character columns whose number of unique values is below nr * proportion
      should_be_factor = function(v) is.character(v) && uniqueN(v) < nr * stringsAsFactors
      cols_to_factor = which(vapply_1b(ans, should_be_factor))
    } else {
      cols_to_factor = which(vapply_1b(ans, is.character))
    }
    if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="")
    for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j)))
  }

  if (!missing(col.names))   # FR #768
    setnames(ans, col.names) # setnames checks and errors automatically
  if (!is.null(key) && data.table) {
    if (!is.character(key))
      stop("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
    if (length(key) == 1L) {
      # a single string may name several columns separated by commas
      key = strsplit(key, split = ",", fixed = TRUE)[[1L]]
    }
    setkeyv(ans, key)
  }
  if (yaml) setattr(ans, 'yaml_metadata', yaml_header)
  if (!is.null(index) && data.table) {
    if (!all(sapply(index, is.character)))
      stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
    if (is.list(index)) {
      to_split = sapply(index, length) == 1L
      if (any(to_split))
        index[to_split] = sapply(index[to_split], strsplit, split = ",", fixed = TRUE)
    } else {
      if (length(index) == 1L) {
        # setindexv accepts lists, so no [[1]]
        index = strsplit(index, split = ",", fixed = TRUE)
      }
    }
    setindexv(ans, index)
  }
  ans
}
360
# Simplified but faster stand-in for `factor()`, for internal use only.
# Levels are the first value of each group reported by forderv() (NAs are excluded via na.last=NA);
# codes are the positions of x within those levels as found by chmatch().
# Returns the value of the final setattr() call on the integer codes.
as_factor = function(x) {
  ord = forderv(x, retGrp = TRUE, na.last = NA)
  grp_starts = attr(ord, "starts", exact = TRUE)
  # when forderv() returns a zero-length ordering (the "all sorted" case), group starts index x directly
  first_of_group = if (length(ord)) ord[grp_starts] else grp_starts
  lvls = x[first_of_group]
  codes = chmatch(x, lvls)
  setattr(codes, 'levels', lvls)
  setattr(codes, 'class', 'factor')
}
370
# Convert a character vector of raw values to a raw vector by handing the
# strings to scan(), as read.csv does (ultimately src/main/scan.c and strtoraw).
as_raw = function(x) {
  parsed = scan(text = x, what = raw(0L), quiet = TRUE)
  parsed
}
374