1## ---- echo = FALSE, message = FALSE-------------------------------------------
2require(data.table)
3knitr::opts_chunk$set(
4  comment = "#",
5    error = FALSE,
6     tidy = FALSE,
7    cache = FALSE,
8 collapse = TRUE
9)
10
11## ----echo = FALSE---------------------------------------------------------------------------------
12options(width = 100L)
13
14## -------------------------------------------------------------------------------------------------
15input <- if (file.exists("flights14.csv")) {
16   "flights14.csv"
17} else {
18  "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
19}
20flights <- fread(input)
21flights
22dim(flights)
23
24## -------------------------------------------------------------------------------------------------
25DT = data.table(
26  ID = c("b","b","b","a","a","c"),
27  a = 1:6,
28  b = 7:12,
29  c = 13:18
30)
31DT
32class(DT$ID)
33
34## ----eval = FALSE---------------------------------------------------------------------------------
35#  DT[i, j, by]
36#
37#  ##   R:                 i                 j        by
38#  ## SQL:  where | order by   select | update  group by
39
40## -------------------------------------------------------------------------------------------------
41ans <- flights[origin == "JFK" & month == 6L]
42head(ans)
43
44## -------------------------------------------------------------------------------------------------
45ans <- flights[1:2]
46ans
47
48## -------------------------------------------------------------------------------------------------
49ans <- flights[order(origin, -dest)]
50head(ans)
51
52## -------------------------------------------------------------------------------------------------
53ans <- flights[, arr_delay]
54head(ans)
55
56## -------------------------------------------------------------------------------------------------
57ans <- flights[, list(arr_delay)]
58head(ans)
59
60## -------------------------------------------------------------------------------------------------
61ans <- flights[, .(arr_delay, dep_delay)]
62head(ans)
63
64## alternatively
65# ans <- flights[, list(arr_delay, dep_delay)]
66
67## -------------------------------------------------------------------------------------------------
68ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)]
69head(ans)
70
71## -------------------------------------------------------------------------------------------------
72ans <- flights[, sum( (arr_delay + dep_delay) < 0 )]
73ans
74
75## -------------------------------------------------------------------------------------------------
76ans <- flights[origin == "JFK" & month == 6L,
77               .(m_arr = mean(arr_delay), m_dep = mean(dep_delay))]
78ans
79
80## -------------------------------------------------------------------------------------------------
81ans <- flights[origin == "JFK" & month == 6L, length(dest)]
82ans
83
84## -------------------------------------------------------------------------------------------------
85ans <- flights[origin == "JFK" & month == 6L, .N]
86ans
87
88## ----j_cols_no_with-------------------------------------------------------------------------------
89ans <- flights[, c("arr_delay", "dep_delay")]
90head(ans)
91
92## ----j_cols_dot_prefix----------------------------------------------------------------------------
93select_cols = c("arr_delay", "dep_delay")
94flights[ , ..select_cols]
95
96## ----j_cols_with----------------------------------------------------------------------------------
97flights[ , select_cols, with = FALSE]
98
99## -------------------------------------------------------------------------------------------------
100DF = data.frame(x = c(1,1,1,2,2,3,3,3), y = 1:8)
101
102## (1) normal way
103DF[DF$x > 1, ] # data.frame needs that ',' as well
104
105## (2) using with
106DF[with(DF, x > 1), ]
107
108## ----eval = FALSE---------------------------------------------------------------------------------
109#  ## not run
110#
111#  # returns all columns except arr_delay and dep_delay
112#  ans <- flights[, !c("arr_delay", "dep_delay")]
113#  # or
114#  ans <- flights[, -c("arr_delay", "dep_delay")]
115
116## ----eval = FALSE---------------------------------------------------------------------------------
117#  ## not run
118#
119#  # returns year,month and day
120#  ans <- flights[, year:day]
121#  # returns day, month and year
122#  ans <- flights[, day:year]
123#  # returns all columns except year, month and day
124#  ans <- flights[, -(year:day)]
125#  ans <- flights[, !(year:day)]
126
127## -------------------------------------------------------------------------------------------------
128ans <- flights[, .(.N), by = .(origin)]
129ans
130
131## or equivalently using a character vector in 'by'
132# ans <- flights[, .(.N), by = "origin"]
133
134## -------------------------------------------------------------------------------------------------
135ans <- flights[, .N, by = origin]
136ans
137
138## -------------------------------------------------------------------------------------------------
139ans <- flights[carrier == "AA", .N, by = origin]
140ans
141
142## -------------------------------------------------------------------------------------------------
143ans <- flights[carrier == "AA", .N, by = .(origin, dest)]
144head(ans)
145
146## or equivalently using a character vector in 'by'
147# ans <- flights[carrier == "AA", .N, by = c("origin", "dest")]
148
149## -------------------------------------------------------------------------------------------------
150ans <- flights[carrier == "AA",
151        .(mean(arr_delay), mean(dep_delay)),
152        by = .(origin, dest, month)]
153ans
154
155## -------------------------------------------------------------------------------------------------
156ans <- flights[carrier == "AA",
157        .(mean(arr_delay), mean(dep_delay)),
158        keyby = .(origin, dest, month)]
159ans
160
161## -------------------------------------------------------------------------------------------------
162ans <- flights[carrier == "AA", .N, by = .(origin, dest)]
163
164## -------------------------------------------------------------------------------------------------
165ans <- ans[order(origin, -dest)]
166head(ans)
167
168## -------------------------------------------------------------------------------------------------
169ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)]
170head(ans, 10)
171
172## ----eval = FALSE---------------------------------------------------------------------------------
173#  DT[ ...
174#     ][ ...
175#       ][ ...
176#         ]
177
178## -------------------------------------------------------------------------------------------------
179ans <- flights[, .N, .(dep_delay>0, arr_delay>0)]
180ans
181
182## -------------------------------------------------------------------------------------------------
183DT
184
185DT[, print(.SD), by = ID]
186
187## -------------------------------------------------------------------------------------------------
188DT[, lapply(.SD, mean), by = ID]
189
190## -------------------------------------------------------------------------------------------------
191flights[carrier == "AA",                       ## Only on trips with carrier "AA"
192        lapply(.SD, mean),                     ## compute the mean
193        by = .(origin, dest, month),           ## for every 'origin,dest,month'
194        .SDcols = c("arr_delay", "dep_delay")] ## for just those specified in .SDcols
195
196## -------------------------------------------------------------------------------------------------
197ans <- flights[, head(.SD, 2), by = month]
198head(ans)
199
200## -------------------------------------------------------------------------------------------------
201DT[, .(val = c(a,b)), by = ID]
202
203## -------------------------------------------------------------------------------------------------
204DT[, .(val = list(c(a,b))), by = ID]
205
206## -------------------------------------------------------------------------------------------------
207## (1) look at the difference between
208DT[, print(c(a,b)), by = ID]
209
210## (2) and
211DT[, print(list(c(a,b))), by = ID]
212
213## ----eval = FALSE---------------------------------------------------------------------------------
214#  DT[i, j, by]
215
216