% File src/library/utils/man/read.table.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2016 R Core Team
% Distributed under GPL 2 or later

\name{read.table}
\alias{read.table}
\alias{read.csv}
\alias{read.csv2}
\alias{read.delim}
\alias{read.delim2}
\title{Data Input}
\description{
  Reads a file in table format and creates a data frame from it, with
  cases corresponding to lines and variables to fields in the file.
}
\usage{
read.table(file, header = FALSE, sep = "", quote = "\"'",
           dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
           row.names, col.names, as.is = !stringsAsFactors,
           na.strings = "NA", colClasses = NA, nrows = -1,
           skip = 0, check.names = TRUE, fill = !blank.lines.skip,
           strip.white = FALSE, blank.lines.skip = TRUE,
           comment.char = "#",
           allowEscapes = FALSE, flush = FALSE,
           stringsAsFactors = FALSE,
           fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)

read.csv(file, header = TRUE, sep = ",", quote = "\"",
         dec = ".", fill = TRUE, comment.char = "", \dots)

read.csv2(file, header = TRUE, sep = ";", quote = "\"",
          dec = ",", fill = TRUE, comment.char = "", \dots)

read.delim(file, header = TRUE, sep = "\t", quote = "\"",
           dec = ".", fill = TRUE, comment.char = "", \dots)

read.delim2(file, header = TRUE, sep = "\t", quote = "\"",
            dec = ",", fill = TRUE, comment.char = "", \dots)
}
\arguments{
  \item{file}{the name of the file which the data are to be read from.
    Each row of the table appears as one line of the file.  If it does
    not contain an \emph{absolute} path, the file name is
    \emph{relative} to the current working directory,
    \code{\link{getwd}()}. Tilde-expansion is performed where supported.
    This can be a compressed file (see \code{\link{file}}).

    Alternatively, \code{file} can be a readable text-mode
    \link{connection} (which will be opened for reading if
    necessary, and if so \code{\link{close}}d (and hence destroyed) at
    the end of the function call).  (If \code{\link{stdin}()} is used,
    the prompts for lines may be somewhat confusing.  Terminate input
    with a blank line or an EOF signal, \code{Ctrl-D} on Unix and
    \code{Ctrl-Z} on Windows.  Any pushback on \code{stdin()} will be
    cleared before return.)

    \code{file} can also be a complete URL.  (For the supported URL
    schemes, see the \sQuote{URLs} section of the help for
    \code{\link{url}}.)
  }

  \item{header}{a logical value indicating whether the file contains the
    names of the variables as its first line.  If missing, the value is
    determined from the file format: \code{header} is set to \code{TRUE}
    if and only if the first row contains one fewer field than the
    number of columns.}

  \item{sep}{the field separator character.  Values on each line of the
    file are separated by this character.  If \code{sep = ""} (the
    default for \code{read.table}) the separator is \sQuote{white space},
    that is one or more spaces, tabs, newlines or carriage returns.}

  \item{quote}{the set of quoting characters. To disable quoting
    altogether, use \code{quote = ""}.  See \code{\link{scan}} for the
    behaviour on quotes embedded in quotes.  Quoting is only considered
    for columns read as character, which is all of them unless
    \code{colClasses} is specified.}

  \item{dec}{the character used in the file for decimal points.}

  \item{numerals}{string indicating how to convert numbers whose conversion
    to double precision would lose accuracy, see \code{\link{type.convert}}.
    Can be abbreviated.  (Applies also to complex-number inputs.)}

  \item{row.names}{a vector of row names.  This can be a vector giving
    the actual row names, or a single number giving the column of the
    table which contains the row names, or a character string giving the
    name of the table column containing the row names.

    If there is a header and the first row contains one fewer field than
    the number of columns, the first column in the input is used for the
    row names.  Otherwise if \code{row.names} is missing, the rows are
    numbered.

    Using \code{row.names = NULL} forces row numbering. Missing or
    \code{NULL} \code{row.names} generate row names that are considered
    to be \sQuote{automatic} (and not preserved by \code{\link{as.matrix}}).
  }

  \item{col.names}{a vector of optional names for the variables.
    The default is to use \code{"V"} followed by the column number.}

  \item{as.is}{controls conversion of character variables (insofar as
    they are not converted to logical, numeric or complex) to factors,
    if not otherwise specified by \code{colClasses}.
    Its value is either a vector of logicals (values are recycled if
    necessary), or a vector of numeric or character indices which
    specify which columns should not be converted to factors.

    Note: to suppress all conversions including those of numeric
    columns, set \code{colClasses = "character"}.

    Note that \code{as.is} is specified per column (not per
    variable) and so includes the column of row names (if any) and any
    columns to be skipped.
  }

  \item{na.strings}{a character vector of strings which are to be
    interpreted as \code{\link{NA}} values.  Blank fields are also
    considered to be missing values in logical, integer, numeric and
    complex fields.  Note that the test happens \emph{after}
    white space is stripped from the input, so \code{na.strings}
    values may need their own white space stripped in advance.}
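
  For example, \code{na.strings = c("NA", "n/a", "-999")} treats all three
  markers as missing; the extra markers here are purely illustrative and
  should be adapted to the conventions of the file being read.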

  \item{colClasses}{character.  A vector of classes to be assumed for
    the columns.  If unnamed, recycled as necessary.  If named, names
    are matched with unspecified values being taken to be \code{NA}.

    Possible values are \code{NA} (the default, when
    \code{\link{type.convert}} is used), \code{"NULL"} (when the column
    is skipped), one of the atomic vector classes (logical, integer,
    numeric, complex, character, raw), or \code{"factor"}, \code{"Date"}
    or \code{"POSIXct"}.  Otherwise there needs to be an \code{as}
    method (from package \pkg{methods}) for conversion from
    \code{"character"} to the specified formal class.

    Note that \code{colClasses} is specified per column (not per
    variable) and so includes the column of row names (if any).
  }
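
  For example (both calls are illustrative sketches, not tied to a
  particular data set), \code{colClasses = c("character", "NULL", "numeric")}
  reads the first column as character, skips the second and reads the third
  as numeric, while the named form \code{colClasses = c(date = "Date")}
  sets only the column named \code{date} and leaves the remaining columns
  to \code{\link{type.convert}}.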

  \item{nrows}{integer: the maximum number of rows to read in.  Negative
    and other invalid values are ignored.}

  \item{skip}{integer: the number of lines of the data file to skip before
    beginning to read data.}

  \item{check.names}{logical.  If \code{TRUE} then the names of the
    variables in the data frame are checked to ensure that they are
    syntactically valid variable names.  If necessary they are adjusted
    (by \code{\link{make.names}}) so that they are, and also to ensure
    that there are no duplicates.}

  \item{fill}{logical. If \code{TRUE} then in case the rows have unequal
    length, blank fields are implicitly added.  See \sQuote{Details}.}

  \item{strip.white}{logical. Used only when \code{sep} has
    been specified, and allows the stripping of leading and trailing
    white space from unquoted \code{character} fields (\code{numeric} fields
    are always stripped).  See \code{\link{scan}} for further details
    (including the exact meaning of \sQuote{white space}),
    remembering that the columns may include the row names.}

  \item{blank.lines.skip}{logical: if \code{TRUE} blank lines in the
    input are ignored.}

  \item{comment.char}{character: a character vector of length one
    containing a single character or an empty string.  Use \code{""} to
    turn off the interpretation of comments altogether.}

  \item{allowEscapes}{logical.  Should C-style escapes such as
    \samp{\\n} be processed or read verbatim (the default)?  Note that if
    not within quotes these could be interpreted as a delimiter (but not
    as a comment character).  For more details see \code{\link{scan}}.}

  \item{flush}{logical: if \code{TRUE}, \code{scan} will flush to the
    end of the line after reading the last of the fields requested.
    This allows putting comments after the last field.}

  \item{stringsAsFactors}{logical: should character vectors be converted
    to factors?  Note that this is overridden by \code{as.is} and
    \code{colClasses}, both of which allow finer control.}

  \item{fileEncoding}{character string: if non-empty declares the
    encoding used on a file (not a connection) so the character data can
    be re-encoded.  See the \sQuote{Encoding} section of the help for
    \code{\link{file}}, the \sQuote{R Data Import/Export} manual and
    \sQuote{Note}.
  }

  \item{encoding}{encoding to be assumed for input strings.  It is
    used to mark character strings as known to be in
    Latin-1 or UTF-8 (see \code{\link{Encoding}}): it is not used to
    re-encode the input, but allows \R to handle encoded strings in
    their native encoding (if one of those two).  See \sQuote{Value}
    and \sQuote{Note}.
  }

  \item{text}{character string: if \code{file} is not supplied and this is,
    then data are read from the value of \code{text} via a text connection.
    Notice that a literal string can be used to include (small) data sets
    within R code.
  }

  \item{skipNul}{logical: should nuls be skipped?}

  \item{\dots}{Further arguments to be passed to \code{read.table}.}
}

\details{
  This function is the principal means of reading tabular data into \R.

  Unless \code{colClasses} is specified, all columns are read as
  character columns and then converted using \code{\link{type.convert}}
  to logical, integer, numeric, complex or (depending on \code{as.is})
  factor as appropriate.  Quotes are (by default) interpreted in all
  fields, so a column of values like \code{"42"} will result in an
  integer column.
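
  For example (a minimal sketch using inline \code{text =} input), quoted
  numeric fields are still converted, so both columns below end up integer:
\preformatted{str(read.table(header = TRUE, text = '
x   y
"1" 10
"2" 20
'))
}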

  A field or line is \sQuote{blank} if it contains nothing (except
  whitespace if no separator is specified) before a comment character or
  the end of the field or line.

  If \code{row.names} is not specified and the header line has one fewer
  entry than the number of columns, the first column is taken to be the
  row names.  This allows data frames to be read in from the format in
  which they are printed.  If \code{row.names} is specified and does
  not refer to the first column, that column is discarded from such files.

  The number of data columns is determined by looking at the first five
  lines of input (or the whole input if it has fewer than five lines), or
  from the length of \code{col.names} if it is specified and is longer.
  This could conceivably be wrong if \code{fill} or
  \code{blank.lines.skip} is true, so specify \code{col.names} if
  necessary (as in the \sQuote{Examples}).

  \code{read.csv} and \code{read.csv2} are identical to
  \code{read.table} except for the defaults.  They are intended for
  reading \sQuote{comma separated value} files (\file{.csv}) or
  (\code{read.csv2}) the variant used in countries that use a comma as
  decimal point and a semicolon as field separator.  Similarly,
  \code{read.delim} and \code{read.delim2} are for reading delimited
  files, defaulting to the TAB character for the delimiter.  Notice that
  \code{header = TRUE} and \code{fill = TRUE} in these variants, and
  that the comment character is disabled.
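
  For example (an illustrative sketch with inline data), a
  semicolon-separated file that uses a comma as the decimal mark can be
  read with \code{read.csv2}:
\preformatted{read.csv2(text = "
name;value
a;1,5
b;2,25
")
}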

  The rest of the line after a comment character is skipped; quotes
  are not processed in comments.  Complete comment lines are allowed
  provided \code{blank.lines.skip = TRUE}; however, comment lines prior
  to the header must have the comment character in the first non-blank
  column.

  Quoted fields with embedded newlines are supported except after a
  comment character.  Embedded nuls are unsupported: skipping them (with
  \code{skipNul = TRUE}) may work.
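
  For example (inline data for illustration), with the default
  \code{comment.char = "#"} everything from the \samp{#} to the end of the
  line is dropped:
\preformatted{read.table(header = TRUE, text = "
# a comment line before the header
x y
1 2  # a trailing comment
3 4
")
}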
}

\section{CSV files}{
  See the help on \code{\link{write.csv}} for the various conventions
  for \code{.csv} files.  The commonest form of CSV file with row names
  needs to be read with \code{read.csv(..., row.names = 1)} to use the
  names in the first column of the file as row names.
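
  For example (a small inline sketch of that format):
\preformatted{read.csv(row.names = 1, text = "
,height,weight
subj1,170,65
subj2,180,80
")
}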
}

\value{
  A data frame (\code{\link{data.frame}}) containing a representation of
  the data in the file.

  Empty input is an error unless \code{col.names} is specified, when a
  0-row data frame is returned: similarly giving just a header line if
  \code{header = TRUE} results in a 0-row data frame.  Note that in
  either case the columns will be logical unless \code{colClasses} was
  supplied.

  Character strings in the result (including factor levels) will have a
  declared encoding if \code{encoding} is \code{"latin1"} or
  \code{"UTF-8"}.
}

\section{Memory usage}{
  These functions can use a surprising amount of memory when reading
  large files.  There is extensive discussion in the \sQuote{R Data
  Import/Export} manual, supplementing the notes here.

  Less memory will be used if \code{colClasses} is specified as one of
  the six \link{atomic} vector classes.  This can be particularly so when
  reading a column that takes many distinct numeric values, as storing
  each distinct value as a character string can take up to 14 times as
  much memory as storing it as an integer.

  Using \code{nrows}, even as a mild over-estimate, will help memory
  usage.

  Using \code{comment.char = ""} will be appreciably faster than the
  \code{read.table} default.
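
  For a large regular file these hints might be combined along the
  following lines (an illustrative sketch; the file name and column
  layout are hypothetical):
\preformatted{## "big.dat" and its column layout are hypothetical
read.table("big.dat", header = TRUE, comment.char = "",
           colClasses = c("character", "numeric", "numeric", "numeric"),
           nrows = 1e6)
}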

  \code{read.table} is not the right tool for reading large matrices,
  especially those with many columns: it is designed to read
  \emph{data frames} which may have columns of very different classes.
  Use \code{\link{scan}} instead for matrices.
}

\note{
  The columns referred to in \code{as.is} and \code{colClasses} include
  the column of row names (if any).

  There are two approaches for reading input that is not in the local
  encoding.  If the input is known to be UTF-8 or Latin-1, use the
  \code{encoding} argument to declare that.  If the input is in some
  other encoding, then it may be translated on input.  The \code{fileEncoding}
  argument achieves this by setting up a connection to do the re-encoding
  into the current locale.  Note that on Windows, or other systems not running
  in a UTF-8 locale, this may not be possible for characters that cannot be
  represented in the current locale.
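
  For example (the file names are hypothetical), a file already known to
  be UTF-8 can be declared as such, whereas a Latin-1 file can be
  re-encoded on input:
\preformatted{## hypothetical file names
read.table("utf8.txt",   header = TRUE, encoding = "UTF-8")
read.table("latin1.txt", header = TRUE, fileEncoding = "latin1")
}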
}

\seealso{
  The \sQuote{R Data Import/Export} manual.

  \code{\link{scan}}, \code{\link{type.convert}},
  \code{\link{read.fwf}} for reading \emph{f}ixed \emph{w}idth
  \emph{f}ormatted input;
  \code{\link{write.table}};
  \code{\link{data.frame}}.

  \code{\link{count.fields}} can be useful to determine problems with
  reading files which result in reports of incorrect record lengths (see
  the \sQuote{Examples} below).

  \url{https://tools.ietf.org/html/rfc4180} for the IANA definition of
  CSV files (which requires comma as separator and CRLF line endings).
}

\references{
  Chambers, J. M. (1992)
  \emph{Data for models.}
  Chapter 3 of \emph{Statistical Models in S}
  eds J. M. Chambers and T. J. Hastie, Wadsworth & Brooks/Cole.
}

\examples{
## using count.fields to handle unknown maximum number of fields
## when fill = TRUE
test1 <- c(1:5, "6,7", "8,9,10")
tf <- tempfile()
writeLines(test1, tf)

read.csv(tf, fill = TRUE) # 1 column
ncol <- max(count.fields(tf, sep = ","))
read.csv(tf, fill = TRUE, header = FALSE,
         col.names = paste0("V", seq_len(ncol)))
unlink(tf)

## "Inline" data set, using text=
## Notice that leading and trailing empty lines are auto-trimmed

read.table(header = TRUE, text = "
a b
1 2
3 4
")
}


\keyword{file}
\keyword{connection}
