1## ---- echo = FALSE, message = FALSE--------------------------------------------------------------- 2require(data.table) 3knitr::opts_chunk$set( 4 comment = "#", 5 error = FALSE, 6 tidy = FALSE, 7 cache = FALSE, 8 collapse = TRUE) 9 10## ----echo = FALSE--------------------------------------------------------------------------------- 11options(width = 100L) 12 13## ------------------------------------------------------------------------------------------------- 14flights <- fread("flights14.csv") 15head(flights) 16dim(flights) 17 18## ------------------------------------------------------------------------------------------------- 19set.seed(1L) 20DF = data.frame(ID1 = sample(letters[1:2], 10, TRUE), 21 ID2 = sample(1:3, 10, TRUE), 22 val = sample(10), 23 stringsAsFactors = FALSE, 24 row.names = sample(LETTERS[1:10])) 25DF 26 27rownames(DF) 28 29## ------------------------------------------------------------------------------------------------- 30DF["C", ] 31 32## ----eval = FALSE--------------------------------------------------------------------------------- 33# rownames(DF) = sample(LETTERS[1:5], 10, TRUE) 34# # Warning: non-unique values when setting 'row.names': 'C', 'D' 35# # Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed 36 37## ------------------------------------------------------------------------------------------------- 38DT = as.data.table(DF) 39DT 40 41rownames(DT) 42 43## ------------------------------------------------------------------------------------------------- 44setkey(flights, origin) 45head(flights) 46 47## alternatively we can provide character vectors to the function 'setkeyv()' 48# setkeyv(flights, "origin") # useful to program with 49 50## ------------------------------------------------------------------------------------------------- 51flights[.("JFK")] 52 53## alternatively 54# flights[J("JFK")] (or) 55# flights[list("JFK")] 56 57## ----eval = FALSE--------------------------------------------------------------------------------- 58# flights["JFK"] ## same as flights[.("JFK")] 59 60## ----eval = FALSE--------------------------------------------------------------------------------- 61# flights[c("JFK", "LGA")] ## same as flights[.(c("JFK", "LGA"))] 62 63## ------------------------------------------------------------------------------------------------- 64key(flights) 65 66## ------------------------------------------------------------------------------------------------- 67setkey(flights, origin, dest) 68head(flights) 69 70## or alternatively 71# setkeyv(flights, c("origin", "dest")) # provide a character vector of column names 72 73key(flights) 74 75## ------------------------------------------------------------------------------------------------- 76flights[.("JFK", "MIA")] 77 78## ------------------------------------------------------------------------------------------------- 79key(flights) 80 81flights[.("JFK")] ## or in this case simply flights["JFK"], for convenience 82 83## ------------------------------------------------------------------------------------------------- 84flights[.(unique(origin), "MIA")] 85 86## ------------------------------------------------------------------------------------------------- 87key(flights) 88flights[.("LGA", "TPA"), .(arr_delay)] 89 90## ----eval = FALSE--------------------------------------------------------------------------------- 91# flights[.("LGA", "TPA"), "arr_delay", with = FALSE] 92 93## ------------------------------------------------------------------------------------------------- 94flights[.("LGA", "TPA"), .(arr_delay)][order(-arr_delay)] 95 96## ------------------------------------------------------------------------------------------------- 97flights[.("LGA", "TPA"), max(arr_delay)] 98 99## ------------------------------------------------------------------------------------------------- 100# get all 'hours' in flights 101flights[, sort(unique(hour))] 102 103## ------------------------------------------------------------------------------------------------- 104setkey(flights, hour) 105key(flights) 106flights[.(24), hour := 0L] 107key(flights) 108 109## ------------------------------------------------------------------------------------------------- 110flights[, sort(unique(hour))] 111 112## ------------------------------------------------------------------------------------------------- 113setkey(flights, origin, dest) 114key(flights) 115 116## ------------------------------------------------------------------------------------------------- 117ans <- flights["JFK", max(dep_delay), keyby = month] 118head(ans) 119key(ans) 120 121## ------------------------------------------------------------------------------------------------- 122flights[.("JFK", "MIA"), mult = "first"] 123 124## ------------------------------------------------------------------------------------------------- 125flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last"] 126 127## ------------------------------------------------------------------------------------------------- 128flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", nomatch = NULL] 129 130## ----eval = FALSE--------------------------------------------------------------------------------- 131# # key by origin,dest columns 132# flights[.("JFK", "MIA")] 133 134## ----eval = FALSE--------------------------------------------------------------------------------- 135# flights[origin == "JFK" & dest == "MIA"] 136 137## ----eval = FALSE--------------------------------------------------------------------------------- 138# setkey(flights, NULL) 139# flights[origin == "JFK" & dest == "MIA"] 140 141## ------------------------------------------------------------------------------------------------- 142set.seed(2L) 143N = 2e7L 144DT = data.table(x = sample(letters, N, TRUE), 145 y = sample(1000L, N, TRUE), 146 val = runif(N)) 147print(object.size(DT), units = "Mb") 148 149## ------------------------------------------------------------------------------------------------- 150key(DT) 151## (1) Usual way of subsetting - vector scan approach 152t1 <- system.time(ans1 <- DT[x == "g" & y == 877L]) 153t1 154head(ans1) 155dim(ans1) 156 157## ------------------------------------------------------------------------------------------------- 158setkeyv(DT, c("x", "y")) 159key(DT) 160## (2) Subsetting using keys 161t2 <- system.time(ans2 <- DT[.("g", 877L)]) 162t2 163head(ans2) 164dim(ans2) 165 166identical(ans1$val, ans2$val) 167 168## ----eval = FALSE--------------------------------------------------------------------------------- 169# 1, 5, 10, 19, 22, 23, 30 170 171