1## ---- echo = FALSE, message = FALSE------------------------------------------- 2require(data.table) 3knitr::opts_chunk$set( 4 comment = "#", 5 error = FALSE, 6 tidy = FALSE, 7 cache = FALSE, 8 collapse = TRUE 9) 10 11## ----echo = FALSE--------------------------------------------------------------------------------- 12options(width = 100L) 13 14## ------------------------------------------------------------------------------------------------- 15input <- if (file.exists("flights14.csv")) { 16 "flights14.csv" 17} else { 18 "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv" 19} 20flights <- fread(input) 21flights 22dim(flights) 23 24## ------------------------------------------------------------------------------------------------- 25DT = data.table( 26 ID = c("b","b","b","a","a","c"), 27 a = 1:6, 28 b = 7:12, 29 c = 13:18 30) 31DT 32class(DT$ID) 33 34## ----eval = FALSE--------------------------------------------------------------------------------- 35# DT[i, j, by] 36# 37# ## R: i j by 38# ## SQL: where | order by select | update group by 39 40## ------------------------------------------------------------------------------------------------- 41ans <- flights[origin == "JFK" & month == 6L] 42head(ans) 43 44## ------------------------------------------------------------------------------------------------- 45ans <- flights[1:2] 46ans 47 48## ------------------------------------------------------------------------------------------------- 49ans <- flights[order(origin, -dest)] 50head(ans) 51 52## ------------------------------------------------------------------------------------------------- 53ans <- flights[, arr_delay] 54head(ans) 55 56## ------------------------------------------------------------------------------------------------- 57ans <- flights[, list(arr_delay)] 58head(ans) 59 60## ------------------------------------------------------------------------------------------------- 61ans <- flights[, .(arr_delay, dep_delay)] 62head(ans) 63 64## alternatively 65# ans <- flights[, list(arr_delay, dep_delay)] 66 67## ------------------------------------------------------------------------------------------------- 68ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)] 69head(ans) 70 71## ------------------------------------------------------------------------------------------------- 72ans <- flights[, sum( (arr_delay + dep_delay) < 0 )] 73ans 74 75## ------------------------------------------------------------------------------------------------- 76ans <- flights[origin == "JFK" & month == 6L, 77 .(m_arr = mean(arr_delay), m_dep = mean(dep_delay))] 78ans 79 80## ------------------------------------------------------------------------------------------------- 81ans <- flights[origin == "JFK" & month == 6L, length(dest)] 82ans 83 84## ------------------------------------------------------------------------------------------------- 85ans <- flights[origin == "JFK" & month == 6L, .N] 86ans 87 88## ----j_cols_no_with------------------------------------------------------------------------------- 89ans <- flights[, c("arr_delay", "dep_delay")] 90head(ans) 91 92## ----j_cols_dot_prefix---------------------------------------------------------------------------- 93select_cols = c("arr_delay", "dep_delay") 94flights[ , ..select_cols] 95 96## ----j_cols_with---------------------------------------------------------------------------------- 97flights[ , select_cols, with = FALSE] 98 99## ------------------------------------------------------------------------------------------------- 100DF = data.frame(x = c(1,1,1,2,2,3,3,3), y = 1:8) 101 102## (1) normal way 103DF[DF$x > 1, ] # data.frame needs that ',' as well 104 105## (2) using with 106DF[with(DF, x > 1), ] 107 108## ----eval = FALSE--------------------------------------------------------------------------------- 109# ## not run 110# 111# # returns all columns except arr_delay and dep_delay 112# ans <- flights[, !c("arr_delay", "dep_delay")] 113# # or 114# ans <- flights[, -c("arr_delay", "dep_delay")] 115 116## ----eval = FALSE--------------------------------------------------------------------------------- 117# ## not run 118# 119# # returns year,month and day 120# ans <- flights[, year:day] 121# # returns day, month and year 122# ans <- flights[, day:year] 123# # returns all columns except year, month and day 124# ans <- flights[, -(year:day)] 125# ans <- flights[, !(year:day)] 126 127## ------------------------------------------------------------------------------------------------- 128ans <- flights[, .(.N), by = .(origin)] 129ans 130 131## or equivalently using a character vector in 'by' 132# ans <- flights[, .(.N), by = "origin"] 133 134## ------------------------------------------------------------------------------------------------- 135ans <- flights[, .N, by = origin] 136ans 137 138## ------------------------------------------------------------------------------------------------- 139ans <- flights[carrier == "AA", .N, by = origin] 140ans 141 142## ------------------------------------------------------------------------------------------------- 143ans <- flights[carrier == "AA", .N, by = .(origin, dest)] 144head(ans) 145 146## or equivalently using a character vector in 'by' 147# ans <- flights[carrier == "AA", .N, by = c("origin", "dest")] 148 149## ------------------------------------------------------------------------------------------------- 150ans <- flights[carrier == "AA", 151 .(mean(arr_delay), mean(dep_delay)), 152 by = .(origin, dest, month)] 153ans 154 155## ------------------------------------------------------------------------------------------------- 156ans <- flights[carrier == "AA", 157 .(mean(arr_delay), mean(dep_delay)), 158 keyby = .(origin, dest, month)] 159ans 160 161## ------------------------------------------------------------------------------------------------- 162ans <- flights[carrier == "AA", .N, by = .(origin, dest)] 163 164## ------------------------------------------------------------------------------------------------- 165ans <- ans[order(origin, -dest)] 166head(ans) 167 168## ------------------------------------------------------------------------------------------------- 169ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)] 170head(ans, 10) 171 172## ----eval = FALSE--------------------------------------------------------------------------------- 173# DT[ ... 174# ][ ... 175# ][ ... 176# ] 177 178## ------------------------------------------------------------------------------------------------- 179ans <- flights[, .N, .(dep_delay>0, arr_delay>0)] 180ans 181 182## ------------------------------------------------------------------------------------------------- 183DT 184 185DT[, print(.SD), by = ID] 186 187## ------------------------------------------------------------------------------------------------- 188DT[, lapply(.SD, mean), by = ID] 189 190## ------------------------------------------------------------------------------------------------- 191flights[carrier == "AA", ## Only on trips with carrier "AA" 192 lapply(.SD, mean), ## compute the mean 193 by = .(origin, dest, month), ## for every 'origin,dest,month' 194 .SDcols = c("arr_delay", "dep_delay")] ## for just those specified in .SDcols 195 196## ------------------------------------------------------------------------------------------------- 197ans <- flights[, head(.SD, 2), by = month] 198head(ans) 199 200## ------------------------------------------------------------------------------------------------- 201DT[, .(val = c(a,b)), by = ID] 202 203## ------------------------------------------------------------------------------------------------- 204DT[, .(val = list(c(a,b))), by = ID] 205 206## ------------------------------------------------------------------------------------------------- 207## (1) look at the difference between 208DT[, print(c(a,b)), by = ID] 209 210## (2) and 211DT[, print(list(c(a,b))), by = ID] 212 213## ----eval = FALSE--------------------------------------------------------------------------------- 214# DT[i, j, by] 215 216