1# TODO Non-Linear Regressions can be used 2# TODO Further decomposition of the two_fold parameters i.e. 3# the delta method for further two_fold detail 4""" 5Author: Austin Adams 6 7This class implements Oaxaca-Blinder Decomposition. It returns 8a OaxacaResults Class: 9 10OaxacaBlinder: 11Two-Fold (two_fold) 12Three-Fold (three_fold) 13 14OaxacaResults: 15Table Summary (summary) 16 17Oaxaca-Blinder is a statistical method that is used to explain 18the differences between two mean values. The idea is to show 19from two mean values what can be explained by the data and 20what cannot by using OLS regression frameworks. 21 22"The original use by Oaxaca's was to explain the wage 23differential between two different groups of workers, 24but the method has since been applied to numerous other 25topics." (Wikipedia) 26 27The model is designed to accept two endogenous response variables 28and two exogenous explanitory variables. They are then fit using 29the specific type of decomposition that you want. 30 31The method was famously used in Card and Krueger's paper 32"School Quality and Black-White Relative Earnings: A Direct Assessment" (1992) 33 34General reference for Oaxaca-Blinder: 35 36B. Jann "The Blinder-Oaxaca decomposition for linear 37regression models," The Stata Journal, 2008. 38 39Econometrics references for regression models: 40 41E. M. Kitagawa "Components of a Difference Between Two Rates" 42Journal of the American Statistical Association, 1955. 43 44A. S. Blinder "Wage Discrimination: Reduced Form and Structural 45Estimates," The Journal of Human Resources, 1973. 46""" 47from textwrap import dedent 48 49import numpy as np 50 51from statsmodels.regression.linear_model import OLS 52from statsmodels.tools.tools import add_constant 53 54 55class OaxacaBlinder(object): 56 """ 57 Class to perform Oaxaca-Blinder Decomposition. 58 59 Parameters 60 ---------- 61 endog : array_like 62 The endogenous variable or the dependent variable that you are trying 63 to explain. 64 exog : array_like 65 The exogenous variable(s) or the independent variable(s) that you are 66 using to explain the endogenous variable. 67 bifurcate : {int, str} 68 The column of the exogenous variable(s) on which to split. This would 69 generally be the group that you wish to explain the two means for. 70 Int of the column for a NumPy array or int/string for the name of 71 the column in Pandas. 72 hasconst : bool, optional 73 Indicates whether the two exogenous variables include a user-supplied 74 constant. If True, a constant is assumed. If False, a constant is added 75 at the start. If nothing is supplied, then True is assumed. 76 swap : bool, optional 77 Imitates the STATA Oaxaca command by allowing users to choose to swap 78 groups. Unlike STATA, this is assumed to be True instead of False 79 cov_type : str, optional 80 See regression.linear_model.RegressionResults for a description of the 81 available covariance estimators 82 cov_kwds : dict, optional 83 See linear_model.RegressionResults.get_robustcov_results for a 84 description required keywords for alternative covariance estimators 85 86 Notes 87 ----- 88 Please check if your data includes at constant. This will still run, but 89 will return incorrect values if set incorrectly. 90 91 You can access the models by using their code as an attribute, e.g., 92 _t_model for the total model, _f_model for the first model, _s_model for 93 the second model. 94 95 Examples 96 -------- 97 >>> import numpy as np 98 >>> import statsmodels.api as sm 99 >>> data = sm.datasets.ccards.load() 100 101 '3' is the column of which we want to explain or which indicates 102 the two groups. In this case, it is if you rent. 103 104 >>> model = sm.OaxacaBlinder(df.endog, df.exog, 3, hasconst = False) 105 >>> model.two_fold().summary() 106 Oaxaca-Blinder Two-fold Effects 107 Unexplained Effect: 27.94091 108 Explained Effect: 130.80954 109 Gap: 158.75044 110 111 >>> model.three_fold().summary() 112 Oaxaca-Blinder Three-fold Effects 113 Endowments Effect: 321.74824 114 Coefficient Effect: 75.45371 115 Interaction Effect: -238.45151 116 Gap: 158.75044 117 """ 118 119 def __init__( 120 self, 121 endog, 122 exog, 123 bifurcate, 124 hasconst=True, 125 swap=True, 126 cov_type="nonrobust", 127 cov_kwds=None, 128 ): 129 if str(type(exog)).find("pandas") != -1: 130 bifurcate = exog.columns.get_loc(bifurcate) 131 endog, exog = np.array(endog), np.array(exog) 132 133 self.two_fold_type = None 134 self.bifurcate = bifurcate 135 self.cov_type = cov_type 136 self.cov_kwds = cov_kwds 137 self.neumark = np.delete(exog, bifurcate, axis=1) 138 self.exog = exog 139 self.hasconst = hasconst 140 bi_col = exog[:, bifurcate] 141 endog = np.column_stack((bi_col, endog)) 142 bi = np.unique(bi_col) 143 self.bi_col = bi_col 144 145 # split the data along the bifurcate axis, the issue is you need to 146 # delete it after you fit the model for the total model. 147 exog_f = exog[np.where(exog[:, bifurcate] == bi[0])] 148 exog_s = exog[np.where(exog[:, bifurcate] == bi[1])] 149 endog_f = endog[np.where(endog[:, 0] == bi[0])] 150 endog_s = endog[np.where(endog[:, 0] == bi[1])] 151 exog_f = np.delete(exog_f, bifurcate, axis=1) 152 exog_s = np.delete(exog_s, bifurcate, axis=1) 153 endog_f = endog_f[:, 1] 154 endog_s = endog_s[:, 1] 155 self.endog = endog[:, 1] 156 157 self.len_f, self.len_s = len(endog_f), len(endog_s) 158 self.gap = endog_f.mean() - endog_s.mean() 159 160 if swap and self.gap < 0: 161 endog_f, endog_s = endog_s, endog_f 162 exog_f, exog_s = exog_s, exog_f 163 self.gap = endog_f.mean() - endog_s.mean() 164 bi[0], bi[1] = bi[1], bi[0] 165 166 self.bi = bi 167 168 if hasconst is False: 169 exog_f = add_constant(exog_f, prepend=False) 170 exog_s = add_constant(exog_s, prepend=False) 171 self.exog = add_constant(self.exog, prepend=False) 172 self.neumark = add_constant(self.neumark, prepend=False) 173 174 self.exog_f_mean = np.mean(exog_f, axis=0) 175 self.exog_s_mean = np.mean(exog_s, axis=0) 176 177 self._f_model = OLS(endog_f, exog_f).fit( 178 cov_type=cov_type, cov_kwds=cov_kwds 179 ) 180 self._s_model = OLS(endog_s, exog_s).fit( 181 cov_type=cov_type, cov_kwds=cov_kwds 182 ) 183 184 def variance(self, decomp_type, n=5000, conf=0.99): 185 """ 186 A helper function to calculate the variance/std. Used to keep 187 the decomposition functions cleaner 188 """ 189 if self.submitted_n is not None: 190 n = self.submitted_n 191 if self.submitted_conf is not None: 192 conf = self.submitted_conf 193 if self.submitted_weight is not None: 194 submitted_weight = [ 195 self.submitted_weight, 196 1 - self.submitted_weight, 197 ] 198 bi = self.bi 199 bifurcate = self.bifurcate 200 endow_eff_list = [] 201 coef_eff_list = [] 202 int_eff_list = [] 203 exp_eff_list = [] 204 unexp_eff_list = [] 205 for _ in range(0, n): 206 endog = np.column_stack((self.bi_col, self.endog)) 207 exog = self.exog 208 amount = len(endog) 209 210 samples = np.random.randint(0, high=amount, size=amount) 211 endog = endog[samples] 212 exog = exog[samples] 213 neumark = np.delete(exog, bifurcate, axis=1) 214 215 exog_f = exog[np.where(exog[:, bifurcate] == bi[0])] 216 exog_s = exog[np.where(exog[:, bifurcate] == bi[1])] 217 endog_f = endog[np.where(endog[:, 0] == bi[0])] 218 endog_s = endog[np.where(endog[:, 0] == bi[1])] 219 exog_f = np.delete(exog_f, bifurcate, axis=1) 220 exog_s = np.delete(exog_s, bifurcate, axis=1) 221 endog_f = endog_f[:, 1] 222 endog_s = endog_s[:, 1] 223 endog = endog[:, 1] 224 225 two_fold_type = self.two_fold_type 226 227 if self.hasconst is False: 228 exog_f = add_constant(exog_f, prepend=False) 229 exog_s = add_constant(exog_s, prepend=False) 230 exog = add_constant(exog, prepend=False) 231 neumark = add_constant(neumark, prepend=False) 232 233 _f_model = OLS(endog_f, exog_f).fit( 234 cov_type=self.cov_type, cov_kwds=self.cov_kwds 235 ) 236 _s_model = OLS(endog_s, exog_s).fit( 237 cov_type=self.cov_type, cov_kwds=self.cov_kwds 238 ) 239 exog_f_mean = np.mean(exog_f, axis=0) 240 exog_s_mean = np.mean(exog_s, axis=0) 241 242 if decomp_type == 3: 243 endow_eff = (exog_f_mean - exog_s_mean) @ _s_model.params 244 coef_eff = exog_s_mean @ (_f_model.params - _s_model.params) 245 int_eff = (exog_f_mean - exog_s_mean) @ ( 246 _f_model.params - _s_model.params 247 ) 248 249 endow_eff_list.append(endow_eff) 250 coef_eff_list.append(coef_eff) 251 int_eff_list.append(int_eff) 252 253 elif decomp_type == 2: 254 len_f = len(exog_f) 255 len_s = len(exog_s) 256 257 if two_fold_type == "cotton": 258 t_params = (len_f / (len_f + len_s) * _f_model.params) + ( 259 len_s / (len_f + len_s) * _s_model.params 260 ) 261 262 elif two_fold_type == "reimers": 263 t_params = 0.5 * (_f_model.params + _s_model.params) 264 265 elif two_fold_type == "self_submitted": 266 t_params = ( 267 submitted_weight[0] * _f_model.params 268 + submitted_weight[1] * _s_model.params 269 ) 270 271 elif two_fold_type == "nuemark": 272 _t_model = OLS(endog, neumark).fit( 273 cov_type=self.cov_type, cov_kwds=self.cov_kwds 274 ) 275 t_params = _t_model.params 276 277 else: 278 _t_model = OLS(endog, exog).fit( 279 cov_type=self.cov_type, cov_kwds=self.cov_kwds 280 ) 281 t_params = np.delete(_t_model.params, bifurcate) 282 283 unexplained = (exog_f_mean @ (_f_model.params - t_params)) + ( 284 exog_s_mean @ (t_params - _s_model.params) 285 ) 286 287 explained = (exog_f_mean - exog_s_mean) @ t_params 288 289 unexp_eff_list.append(unexplained) 290 exp_eff_list.append(explained) 291 292 high, low = int(n * conf), int(n * (1 - conf)) 293 if decomp_type == 3: 294 return [ 295 np.std(np.sort(endow_eff_list)[low:high]), 296 np.std(np.sort(coef_eff_list)[low:high]), 297 np.std(np.sort(int_eff_list)[low:high]), 298 ] 299 elif decomp_type == 2: 300 return [ 301 np.std(np.sort(unexp_eff_list)[low:high]), 302 np.std(np.sort(exp_eff_list)[low:high]), 303 ] 304 305 def three_fold(self, std=False, n=None, conf=None): 306 """ 307 Calculates the three-fold Oaxaca Blinder Decompositions 308 309 Parameters 310 ---------- 311 std: boolean, optional 312 If true, bootstrapped standard errors will be calculated. 313 n: int, optional 314 A amount of iterations to calculate the bootstrapped 315 standard errors. This defaults to 5000. 316 conf: float, optional 317 This is the confidence required for the standard error 318 calculation. Defaults to .99, but could be anything less 319 than or equal to one. One is heavy discouraged, due to the 320 extreme outliers inflating the variance. 321 322 Returns 323 ------- 324 OaxacaResults 325 A results container for the three-fold decomposition. 326 """ 327 self.submitted_n = n 328 self.submitted_conf = conf 329 self.submitted_weight = None 330 std_val = None 331 self.endow_eff = ( 332 self.exog_f_mean - self.exog_s_mean 333 ) @ self._s_model.params 334 self.coef_eff = self.exog_s_mean @ ( 335 self._f_model.params - self._s_model.params 336 ) 337 self.int_eff = (self.exog_f_mean - self.exog_s_mean) @ ( 338 self._f_model.params - self._s_model.params 339 ) 340 341 if std is True: 342 std_val = self.variance(3) 343 344 return OaxacaResults( 345 (self.endow_eff, self.coef_eff, self.int_eff, self.gap), 346 3, 347 std_val=std_val, 348 ) 349 350 def two_fold( 351 self, 352 std=False, 353 two_fold_type="pooled", 354 submitted_weight=None, 355 n=None, 356 conf=None, 357 ): 358 """ 359 Calculates the two-fold or pooled Oaxaca Blinder Decompositions 360 361 Methods 362 ------- 363 std: boolean, optional 364 If true, bootstrapped standard errors will be calculated. 365 366 two_fold_type: string, optional 367 This method allows for the specific calculation of the 368 non-discriminatory model. There are four different types 369 available at this time. pooled, cotton, reimers, self_submitted. 370 Pooled is assumed and if a non-viable parameter is given, 371 pooled will be ran. 372 373 pooled - This type assumes that the pooled model's parameters 374 (a normal regression) is the non-discriminatory model. 375 This includes the indicator variable. This is generally 376 the best idea. If you have economic justification for 377 using others, then use others. 378 379 nuemark - This is similar to the pooled type, but the regression 380 is not done including the indicator variable. 381 382 cotton - This type uses the adjusted in Cotton (1988), which 383 accounts for the undervaluation of one group causing the 384 overevalution of another. It uses the sample size weights for 385 a linear combination of the two model parameters 386 387 reimers - This type uses a linear combination of the two 388 models with both parameters being 50% of the 389 non-discriminatory model. 390 391 self_submitted - This allows the user to submit their 392 own weights. Please be sure to put the weight of the larger mean 393 group only. This should be submitted in the 394 submitted_weights variable. 395 396 submitted_weight: int/float, required only for self_submitted, 397 This is the submitted weight for the larger mean. If the 398 weight for the larger mean is p, then the weight for the 399 other mean is 1-p. Only submit the first value. 400 401 n: int, optional 402 A amount of iterations to calculate the bootstrapped 403 standard errors. This defaults to 5000. 404 conf: float, optional 405 This is the confidence required for the standard error 406 calculation. Defaults to .99, but could be anything less 407 than or equal to one. One is heavy discouraged, due to the 408 extreme outliers inflating the variance. 409 410 Returns 411 ------- 412 OaxacaResults 413 A results container for the two-fold decomposition. 414 """ 415 self.submitted_n = n 416 self.submitted_conf = conf 417 std_val = None 418 self.two_fold_type = two_fold_type 419 self.submitted_weight = submitted_weight 420 421 if two_fold_type == "cotton": 422 self.t_params = ( 423 self.len_f / (self.len_f + self.len_s) * self._f_model.params 424 ) + (self.len_s / (self.len_f + self.len_s) * self._s_model.params) 425 426 elif two_fold_type == "reimers": 427 self.t_params = 0.5 * (self._f_model.params + self._s_model.params) 428 429 elif two_fold_type == "self_submitted": 430 if submitted_weight is None: 431 raise ValueError("Please submit weights") 432 submitted_weight = [submitted_weight, 1 - submitted_weight] 433 self.t_params = ( 434 submitted_weight[0] * self._f_model.params 435 + submitted_weight[1] * self._s_model.params 436 ) 437 438 elif two_fold_type == "nuemark": 439 self._t_model = OLS(self.endog, self.neumark).fit( 440 cov_type=self.cov_type, cov_kwds=self.cov_kwds 441 ) 442 self.t_params = self._t_model.params 443 444 else: 445 self._t_model = OLS(self.endog, self.exog).fit( 446 cov_type=self.cov_type, cov_kwds=self.cov_kwds 447 ) 448 self.t_params = np.delete(self._t_model.params, self.bifurcate) 449 450 self.unexplained = ( 451 self.exog_f_mean @ (self._f_model.params - self.t_params) 452 ) + (self.exog_s_mean @ (self.t_params - self._s_model.params)) 453 self.explained = (self.exog_f_mean - self.exog_s_mean) @ self.t_params 454 455 if std is True: 456 std_val = self.variance(2) 457 458 return OaxacaResults( 459 (self.unexplained, self.explained, self.gap), 2, std_val=std_val 460 ) 461 462 463class OaxacaResults: 464 """ 465 This class summarizes the fit of the OaxacaBlinder model. 466 467 Use .summary() to get a table of the fitted values or 468 use .params to receive a list of the values 469 use .std to receive a list of the standard errors 470 471 If a two-fold model was fitted, this will return 472 unexplained effect, explained effect, and the 473 mean gap. The list will always be of the following order 474 and type. If standard error was asked for, then standard error 475 calculations will also be included for each variable after each 476 calculated effect. 477 478 unexplained : float 479 This is the effect that cannot be explained by the data at hand. 480 This does not mean it cannot be explained with more. 481 explained : float 482 This is the effect that can be explained using the data. 483 gap : float 484 This is the gap in the mean differences of the two groups. 485 486 If a three-fold model was fitted, this will 487 return characteristic effect, coefficient effect 488 interaction effect, and the mean gap. The list will 489 be of the following order and type. If standard error was asked 490 for, then standard error calculations will also be included for 491 each variable after each calculated effect. 492 493 endowment effect : float 494 This is the effect due to the group differences in 495 predictors 496 coefficient effect : float 497 This is the effect due to differences of the coefficients 498 of the two groups 499 interaction effect : float 500 This is the effect due to differences in both effects 501 existing at the same time between the two groups. 502 gap : float 503 This is the gap in the mean differences of the two groups. 504 505 Attributes 506 ---------- 507 params 508 A list of all values for the fitted models. 509 std 510 A list of standard error calculations. 511 """ 512 513 def __init__(self, results, model_type, std_val=None): 514 self.params = results 515 self.std = std_val 516 self.model_type = model_type 517 518 def summary(self): 519 """ 520 Print a summary table with the Oaxaca-Blinder effects 521 """ 522 if self.model_type == 2: 523 if self.std is None: 524 print( 525 dedent( 526 f"""\ 527 Oaxaca-Blinder Two-fold Effects 528 Unexplained Effect: {self.params[0]:.5f} 529 Explained Effect: {self.params[1]:.5f} 530 Gap: {self.params[2]:.5f}""" 531 ) 532 ) 533 else: 534 print( 535 dedent( 536 """\ 537 Oaxaca-Blinder Two-fold Effects 538 Unexplained Effect: {:.5f} 539 Unexplained Standard Error: {:.5f} 540 Explained Effect: {:.5f} 541 Explained Standard Error: {:.5f} 542 Gap: {:.5f}""".format( 543 self.params[0], 544 self.std[0], 545 self.params[1], 546 self.std[1], 547 self.params[2], 548 ) 549 ) 550 ) 551 if self.model_type == 3: 552 if self.std is None: 553 print( 554 dedent( 555 f"""\ 556 Oaxaca-Blinder Three-fold Effects 557 Endowment Effect: {self.params[0]:.5f} 558 Coefficient Effect: {self.params[1]:.5f} 559 Interaction Effect: {self.params[2]:.5f} 560 Gap: {self.params[3]:.5f}""" 561 ) 562 ) 563 else: 564 print( 565 dedent( 566 f"""\ 567 Oaxaca-Blinder Three-fold Effects 568 Endowment Effect: {self.params[0]:.5f} 569 Endowment Standard Error: {self.std[0]:.5f} 570 Coefficient Effect: {self.params[1]:.5f} 571 Coefficient Standard Error: {self.std[1]:.5f} 572 Interaction Effect: {self.params[2]:.5f} 573 Interaction Standard Error: {self.std[2]:.5f} 574 Gap: {self.params[3]:.5f}""" 575 ) 576 ) 577