1## ---- echo = FALSE, message = FALSE---------------------------------------------------------------
2require(data.table)
3knitr::opts_chunk$set(
4  comment = "#",
5  error = FALSE,
6  tidy = FALSE,
7  cache = FALSE,
8  collapse = TRUE,
9  out.width = '100%',
10  dpi = 144
11)
12
13## ----download_lahman------------------------------------------------------------------------------
14load('Teams.RData')
15setDT(Teams)
16Teams
17
18load('Pitching.RData')
19setDT(Pitching)
20Pitching
21
22## ----plain_sd-------------------------------------------------------------------------------------
23Pitching[ , .SD]
24
25## ----plain_sd_is_table----------------------------------------------------------------------------
26identical(Pitching, Pitching[ , .SD])
27
28## ----simple_sdcols--------------------------------------------------------------------------------
29# W: Wins; L: Losses; G: Games
30Pitching[ , .SD, .SDcols = c('W', 'L', 'G')]
31
32## ----identify_factors-----------------------------------------------------------------------------
33# teamIDBR: Team ID used by Baseball Reference website
34# teamIDlahman45: Team ID used in Lahman database version 4.5
35# teamIDretro: Team ID used by Retrosheet
36fkt = c('teamIDBR', 'teamIDlahman45', 'teamIDretro')
37# confirm that they're stored as `character`
38Teams[ , sapply(.SD, is.character), .SDcols = fkt]
39
40## ----identify_factors_as_df-----------------------------------------------------------------------
41setDF(Teams) # convert to data.frame for illustration
42sapply(Teams[ , fkt], is.character)
43setDT(Teams) # convert back to data.table
44
45## ----assign_factors-------------------------------------------------------------------------------
46Teams[ , (fkt) := lapply(.SD, factor), .SDcols = fkt]
47# print out the first column to demonstrate success
48head(unique(Teams[[fkt[1L]]]))
49
50## ----sd_as_logical--------------------------------------------------------------------------------
51# while .SDcols accepts a logical vector,
52#   := does not, so we need to convert to column
53#   positions with which()
54fkt_idx = which(sapply(Teams, is.factor))
55Teams[ , (fkt_idx) := lapply(.SD, as.character), .SDcols = fkt_idx]
56head(unique(Teams[[fkt_idx[1L]]]))
57
58## ----sd_patterns----------------------------------------------------------------------------------
59Teams[ , .SD, .SDcols = patterns('team')]
60
61# now convert these columns to factor;
62#   value = TRUE in grep() is for the LHS of := to
63#   get column names instead of positions
64team_idx = grep('team', names(Teams), value = TRUE)
65Teams[ , (team_idx) := lapply(.SD, factor), .SDcols = team_idx]
66
67## ----sd_for_lm, cache = FALSE---------------------------------------------------------------------
68# this generates a list of the 2^k possible extra variables
69#   for models of the form ERA ~ G + (...)
70extra_var = c('yearID', 'teamID', 'G', 'L')
71models = unlist(
72  lapply(0L:length(extra_var), combn, x = extra_var, simplify = FALSE),
73  recursive = FALSE
74)
75
76# here are 16 visually distinct colors, taken from the list of 20 here:
77#   https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
78col16 = c('#e6194b', '#3cb44b', '#ffe119', '#0082c8',
79          '#f58231', '#911eb4', '#46f0f0', '#f032e6',
80          '#d2f53c', '#fabebe', '#008080', '#e6beff',
81          '#aa6e28', '#fffac8', '#800000', '#aaffc3')
82
83par(oma = c(2, 0, 0, 0))
84lm_coef = sapply(models, function(rhs) {
85  # using ERA ~ . and data = .SD, then varying which
86  #   columns are included in .SD allows us to perform this
87  #   iteration over 16 models succinctly.
88  #   coef(.)['W'] extracts the W coefficient from each model fit
89  Pitching[ , coef(lm(ERA ~ ., data = .SD))['W'], .SDcols = c('W', rhs)]
90})
91barplot(lm_coef, names.arg = sapply(models, paste, collapse = '/'),
92        main = 'Wins Coefficient\nWith Various Covariates',
93        col = col16, las = 2L, cex.names = .8)
94
95## ----conditional_join-----------------------------------------------------------------------------
96# to exclude pitchers with exceptional performance in a few games,
97#   subset first; then define rank of pitchers within their team each year
98#   (in general, we should put more care into the 'ties.method' of frank)
99Pitching[G > 5, rank_in_team := frank(ERA), by = .(teamID, yearID)]
100Pitching[rank_in_team == 1, team_performance :=
101           Teams[.SD, Rank, on = c('teamID', 'yearID')]]
102
103## ----grouping_png, fig.cap = "Grouping, Illustrated", echo = FALSE--------------------------------
104knitr::include_graphics('plots/grouping_illustration.png')
105
106## ----group_sd_last--------------------------------------------------------------------------------
107# the data is already sorted by year; if it weren't
108#   we could do Teams[order(yearID), .SD[.N], by = teamID]
109Teams[ , .SD[.N], by = teamID]
110
111## ----sd_team_best_year----------------------------------------------------------------------------
112Teams[ , .SD[which.max(R)], by = teamID]
113
114## ----group_lm, results = 'hide'-------------------------------------------------------------------
115# Overall coefficient for comparison
116overall_coef = Pitching[ , coef(lm(ERA ~ W))['W']]
117# use the .N > 20 filter to exclude teams with few observations
118Pitching[ , if (.N > 20L) .(w_coef = coef(lm(ERA ~ W))['W']), by = teamID
119          ][ , hist(w_coef, 20L, las = 1L,
120                    xlab = 'Fitted Coefficient on W',
121                    ylab = 'Number of Teams', col = 'darkgreen',
122                    main = 'Team-Level Distribution\nWin Coefficients on ERA')]
123abline(v = overall_coef, lty = 2L, col = 'red')
124
125