1## ---- echo = FALSE, message = FALSE--------------------------------------------------------------- 2require(data.table) 3knitr::opts_chunk$set( 4 comment = "#", 5 error = FALSE, 6 tidy = FALSE, 7 cache = FALSE, 8 collapse = TRUE, 9 out.width = '100%', 10 dpi = 144 11) 12 13## ----download_lahman------------------------------------------------------------------------------ 14load('Teams.RData') 15setDT(Teams) 16Teams 17 18load('Pitching.RData') 19setDT(Pitching) 20Pitching 21 22## ----plain_sd------------------------------------------------------------------------------------- 23Pitching[ , .SD] 24 25## ----plain_sd_is_table---------------------------------------------------------------------------- 26identical(Pitching, Pitching[ , .SD]) 27 28## ----simple_sdcols-------------------------------------------------------------------------------- 29# W: Wins; L: Losses; G: Games 30Pitching[ , .SD, .SDcols = c('W', 'L', 'G')] 31 32## ----identify_factors----------------------------------------------------------------------------- 33# teamIDBR: Team ID used by Baseball Reference website 34# teamIDlahman45: Team ID used in Lahman database version 4.5 35# teamIDretro: Team ID used by Retrosheet 36fkt = c('teamIDBR', 'teamIDlahman45', 'teamIDretro') 37# confirm that they're stored as `character` 38Teams[ , sapply(.SD, is.character), .SDcols = fkt] 39 40## ----identify_factors_as_df----------------------------------------------------------------------- 41setDF(Teams) # convert to data.frame for illustration 42sapply(Teams[ , fkt], is.character) 43setDT(Teams) # convert back to data.table 44 45## ----assign_factors------------------------------------------------------------------------------- 46Teams[ , (fkt) := lapply(.SD, factor), .SDcols = fkt] 47# print out the first column to demonstrate success 48head(unique(Teams[[fkt[1L]]])) 49 50## ----sd_as_logical-------------------------------------------------------------------------------- 51# while .SDcols accepts a logical vector, 52# := does not, so we need to convert to column 53# positions with which() 54fkt_idx = which(sapply(Teams, is.factor)) 55Teams[ , (fkt_idx) := lapply(.SD, as.character), .SDcols = fkt_idx] 56head(unique(Teams[[fkt_idx[1L]]])) 57 58## ----sd_patterns---------------------------------------------------------------------------------- 59Teams[ , .SD, .SDcols = patterns('team')] 60 61# now convert these columns to factor; 62# value = TRUE in grep() is for the LHS of := to 63# get column names instead of positions 64team_idx = grep('team', names(Teams), value = TRUE) 65Teams[ , (team_idx) := lapply(.SD, factor), .SDcols = team_idx] 66 67## ----sd_for_lm, cache = FALSE--------------------------------------------------------------------- 68# this generates a list of the 2^k possible extra variables 69# for models of the form ERA ~ G + (...) 70extra_var = c('yearID', 'teamID', 'G', 'L') 71models = unlist( 72 lapply(0L:length(extra_var), combn, x = extra_var, simplify = FALSE), 73 recursive = FALSE 74) 75 76# here are 16 visually distinct colors, taken from the list of 20 here: 77# https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/ 78col16 = c('#e6194b', '#3cb44b', '#ffe119', '#0082c8', 79 '#f58231', '#911eb4', '#46f0f0', '#f032e6', 80 '#d2f53c', '#fabebe', '#008080', '#e6beff', 81 '#aa6e28', '#fffac8', '#800000', '#aaffc3') 82 83par(oma = c(2, 0, 0, 0)) 84lm_coef = sapply(models, function(rhs) { 85 # using ERA ~ . and data = .SD, then varying which 86 # columns are included in .SD allows us to perform this 87 # iteration over 16 models succinctly. 88 # coef(.)['W'] extracts the W coefficient from each model fit 89 Pitching[ , coef(lm(ERA ~ ., data = .SD))['W'], .SDcols = c('W', rhs)] 90}) 91barplot(lm_coef, names.arg = sapply(models, paste, collapse = '/'), 92 main = 'Wins Coefficient\nWith Various Covariates', 93 col = col16, las = 2L, cex.names = .8) 94 95## ----conditional_join----------------------------------------------------------------------------- 96# to exclude pitchers with exceptional performance in a few games, 97# subset first; then define rank of pitchers within their team each year 98# (in general, we should put more care into the 'ties.method' of frank) 99Pitching[G > 5, rank_in_team := frank(ERA), by = .(teamID, yearID)] 100Pitching[rank_in_team == 1, team_performance := 101 Teams[.SD, Rank, on = c('teamID', 'yearID')]] 102 103## ----grouping_png, fig.cap = "Grouping, Illustrated", echo = FALSE-------------------------------- 104knitr::include_graphics('plots/grouping_illustration.png') 105 106## ----group_sd_last-------------------------------------------------------------------------------- 107# the data is already sorted by year; if it weren't 108# we could do Teams[order(yearID), .SD[.N], by = teamID] 109Teams[ , .SD[.N], by = teamID] 110 111## ----sd_team_best_year---------------------------------------------------------------------------- 112Teams[ , .SD[which.max(R)], by = teamID] 113 114## ----group_lm, results = 'hide'------------------------------------------------------------------- 115# Overall coefficient for comparison 116overall_coef = Pitching[ , coef(lm(ERA ~ W))['W']] 117# use the .N > 20 filter to exclude teams with few observations 118Pitching[ , if (.N > 20L) .(w_coef = coef(lm(ERA ~ W))['W']), by = teamID 119 ][ , hist(w_coef, 20L, las = 1L, 120 xlab = 'Fitted Coefficient on W', 121 ylab = 'Number of Teams', col = 'darkgreen', 122 main = 'Team-Level Distribution\nWin Coefficients on ERA')] 123abline(v = overall_coef, lty = 2L, col = 'red') 124 125