1## ---- echo = FALSE, message = FALSE---------------------------------------------------------------
2require(data.table)
3knitr::opts_chunk$set(
4  comment = "#",
5    error = FALSE,
6     tidy = FALSE,
7    cache = FALSE,
8 collapse = TRUE)
9
10## ----echo = FALSE---------------------------------------------------------------------------------
11options(width = 100L)
12
13## -------------------------------------------------------------------------------------------------
14flights <- fread("flights14.csv")
15head(flights)
16dim(flights)
17
18## -------------------------------------------------------------------------------------------------
19set.seed(1L)
20DF = data.frame(ID1 = sample(letters[1:2], 10, TRUE),
21                ID2 = sample(1:3, 10, TRUE),
22                val = sample(10),
23                stringsAsFactors = FALSE,
24                row.names = sample(LETTERS[1:10]))
25DF
26
27rownames(DF)
28
29## -------------------------------------------------------------------------------------------------
30DF["C", ]
31
32## ----eval = FALSE---------------------------------------------------------------------------------
33#  rownames(DF) = sample(LETTERS[1:5], 10, TRUE)
34#  # Warning: non-unique values when setting 'row.names': 'C', 'D'
35#  # Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
36
37## -------------------------------------------------------------------------------------------------
38DT = as.data.table(DF)
39DT
40
41rownames(DT)
42
43## -------------------------------------------------------------------------------------------------
44setkey(flights, origin)
45head(flights)
46
47## alternatively we can provide character vectors to the function 'setkeyv()'
48# setkeyv(flights, "origin") # useful to program with
49
50## -------------------------------------------------------------------------------------------------
51flights[.("JFK")]
52
53## alternatively
54# flights[J("JFK")] (or)
55# flights[list("JFK")]
56
57## ----eval = FALSE---------------------------------------------------------------------------------
58#  flights["JFK"]              ## same as flights[.("JFK")]
59
60## ----eval = FALSE---------------------------------------------------------------------------------
61#  flights[c("JFK", "LGA")]    ## same as flights[.(c("JFK", "LGA"))]
62
63## -------------------------------------------------------------------------------------------------
64key(flights)
65
66## -------------------------------------------------------------------------------------------------
67setkey(flights, origin, dest)
68head(flights)
69
70## or alternatively
71# setkeyv(flights, c("origin", "dest")) # provide a character vector of column names
72
73key(flights)
74
75## -------------------------------------------------------------------------------------------------
76flights[.("JFK", "MIA")]
77
78## -------------------------------------------------------------------------------------------------
79key(flights)
80
81flights[.("JFK")] ## or in this case simply flights["JFK"], for convenience
82
83## -------------------------------------------------------------------------------------------------
84flights[.(unique(origin), "MIA")]
85
86## -------------------------------------------------------------------------------------------------
87key(flights)
88flights[.("LGA", "TPA"), .(arr_delay)]
89
90## ----eval = FALSE---------------------------------------------------------------------------------
91#  flights[.("LGA", "TPA"), "arr_delay", with = FALSE]
92
93## -------------------------------------------------------------------------------------------------
94flights[.("LGA", "TPA"), .(arr_delay)][order(-arr_delay)]
95
96## -------------------------------------------------------------------------------------------------
97flights[.("LGA", "TPA"), max(arr_delay)]
98
99## -------------------------------------------------------------------------------------------------
100# get all 'hours' in flights
101flights[, sort(unique(hour))]
102
103## -------------------------------------------------------------------------------------------------
104setkey(flights, hour)
105key(flights)
106flights[.(24), hour := 0L]
107key(flights)
108
109## -------------------------------------------------------------------------------------------------
110flights[, sort(unique(hour))]
111
112## -------------------------------------------------------------------------------------------------
113setkey(flights, origin, dest)
114key(flights)
115
116## -------------------------------------------------------------------------------------------------
117ans <- flights["JFK", max(dep_delay), keyby = month]
118head(ans)
119key(ans)
120
121## -------------------------------------------------------------------------------------------------
122flights[.("JFK", "MIA"), mult = "first"]
123
124## -------------------------------------------------------------------------------------------------
125flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last"]
126
127## -------------------------------------------------------------------------------------------------
128flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", nomatch = NULL]
129
130## ----eval = FALSE---------------------------------------------------------------------------------
131#  # key by origin,dest columns
132#  flights[.("JFK", "MIA")]
133
134## ----eval = FALSE---------------------------------------------------------------------------------
135#  flights[origin == "JFK" & dest == "MIA"]
136
137## ----eval = FALSE---------------------------------------------------------------------------------
138#  setkey(flights, NULL)
139#  flights[origin == "JFK" & dest == "MIA"]
140
141## -------------------------------------------------------------------------------------------------
142set.seed(2L)
143N = 2e7L
144DT = data.table(x = sample(letters, N, TRUE),
145                y = sample(1000L, N, TRUE),
146                val = runif(N))
147print(object.size(DT), units = "Mb")
148
149## -------------------------------------------------------------------------------------------------
150key(DT)
151## (1) Usual way of subsetting - vector scan approach
152t1 <- system.time(ans1 <- DT[x == "g" & y == 877L])
153t1
154head(ans1)
155dim(ans1)
156
157## -------------------------------------------------------------------------------------------------
158setkeyv(DT, c("x", "y"))
159key(DT)
160## (2) Subsetting using keys
161t2 <- system.time(ans2 <- DT[.("g", 877L)])
162t2
163head(ans2)
164dim(ans2)
165
166identical(ans1$val, ans2$val)
167
168## ----eval = FALSE---------------------------------------------------------------------------------
169#  1, 5, 10, 19, 22, 23, 30
170
171