1# TODO Non-Linear Regressions can be used
2# TODO Further decomposition of the two_fold parameters i.e.
3# the delta method for further two_fold detail
4"""
5Author: Austin Adams
6
7This class implements Oaxaca-Blinder Decomposition. It returns
8a OaxacaResults Class:
9
10OaxacaBlinder:
11Two-Fold (two_fold)
12Three-Fold (three_fold)
13
14OaxacaResults:
15Table Summary (summary)
16
17Oaxaca-Blinder is a statistical method that is used to explain
18the differences between two mean values. The idea is to show
19from two mean values what can be explained by the data and
20what cannot by using OLS regression frameworks.
21
22"The original use by Oaxaca's was to explain the wage
23differential between two different groups of workers,
24but the method has since been applied to numerous other
25topics." (Wikipedia)
26
27The model is designed to accept two endogenous response variables
28and two exogenous explanitory variables. They are then fit using
29the specific type of decomposition that you want.
30
31The method was famously used in Card and Krueger's paper
32"School Quality and Black-White Relative Earnings: A Direct Assessment" (1992)
33
34General reference for Oaxaca-Blinder:
35
36B. Jann "The Blinder-Oaxaca decomposition for linear
37regression models," The Stata Journal, 2008.
38
39Econometrics references for regression models:
40
41E. M. Kitagawa  "Components of a Difference Between Two Rates"
42Journal of the American Statistical Association, 1955.
43
44A. S. Blinder "Wage Discrimination: Reduced Form and Structural
45Estimates," The Journal of Human Resources, 1973.
46"""
47from textwrap import dedent
48
49import numpy as np
50
51from statsmodels.regression.linear_model import OLS
52from statsmodels.tools.tools import add_constant
53
54
55class OaxacaBlinder(object):
56    """
57    Class to perform Oaxaca-Blinder Decomposition.
58
59    Parameters
60    ----------
61    endog : array_like
62        The endogenous variable or the dependent variable that you are trying
63        to explain.
64    exog : array_like
65        The exogenous variable(s) or the independent variable(s) that you are
66        using to explain the endogenous variable.
67    bifurcate : {int, str}
68        The column of the exogenous variable(s) on which to split. This would
69        generally be the group that you wish to explain the two means for.
70        Int of the column for a NumPy array or int/string for the name of
71        the column in Pandas.
72    hasconst : bool, optional
73        Indicates whether the two exogenous variables include a user-supplied
74        constant. If True, a constant is assumed. If False, a constant is added
75        at the start. If nothing is supplied, then True is assumed.
76    swap : bool, optional
77        Imitates the STATA Oaxaca command by allowing users to choose to swap
78        groups. Unlike STATA, this is assumed to be True instead of False
79    cov_type : str, optional
80        See regression.linear_model.RegressionResults for a description of the
81        available covariance estimators
82    cov_kwds : dict, optional
83        See linear_model.RegressionResults.get_robustcov_results for a
84        description required keywords for alternative covariance estimators
85
86    Notes
87    -----
88    Please check if your data includes at constant. This will still run, but
89    will return incorrect values if set incorrectly.
90
91    You can access the models by using their code as an attribute, e.g.,
92    _t_model for the total model, _f_model for the first model, _s_model for
93    the second model.
94
95    Examples
96    --------
97    >>> import numpy as np
98    >>> import statsmodels.api as sm
99    >>> data = sm.datasets.ccards.load()
100
101    '3' is the column of which we want to explain or which indicates
102    the two groups. In this case, it is if you rent.
103
104    >>> model = sm.OaxacaBlinder(df.endog, df.exog, 3, hasconst = False)
105    >>> model.two_fold().summary()
106    Oaxaca-Blinder Two-fold Effects
107    Unexplained Effect: 27.94091
108    Explained Effect: 130.80954
109    Gap: 158.75044
110
111    >>> model.three_fold().summary()
112    Oaxaca-Blinder Three-fold Effects
113    Endowments Effect: 321.74824
114    Coefficient Effect: 75.45371
115    Interaction Effect: -238.45151
116    Gap: 158.75044
117    """
118
119    def __init__(
120        self,
121        endog,
122        exog,
123        bifurcate,
124        hasconst=True,
125        swap=True,
126        cov_type="nonrobust",
127        cov_kwds=None,
128    ):
129        if str(type(exog)).find("pandas") != -1:
130            bifurcate = exog.columns.get_loc(bifurcate)
131            endog, exog = np.array(endog), np.array(exog)
132
133        self.two_fold_type = None
134        self.bifurcate = bifurcate
135        self.cov_type = cov_type
136        self.cov_kwds = cov_kwds
137        self.neumark = np.delete(exog, bifurcate, axis=1)
138        self.exog = exog
139        self.hasconst = hasconst
140        bi_col = exog[:, bifurcate]
141        endog = np.column_stack((bi_col, endog))
142        bi = np.unique(bi_col)
143        self.bi_col = bi_col
144
145        # split the data along the bifurcate axis, the issue is you need to
146        # delete it after you fit the model for the total model.
147        exog_f = exog[np.where(exog[:, bifurcate] == bi[0])]
148        exog_s = exog[np.where(exog[:, bifurcate] == bi[1])]
149        endog_f = endog[np.where(endog[:, 0] == bi[0])]
150        endog_s = endog[np.where(endog[:, 0] == bi[1])]
151        exog_f = np.delete(exog_f, bifurcate, axis=1)
152        exog_s = np.delete(exog_s, bifurcate, axis=1)
153        endog_f = endog_f[:, 1]
154        endog_s = endog_s[:, 1]
155        self.endog = endog[:, 1]
156
157        self.len_f, self.len_s = len(endog_f), len(endog_s)
158        self.gap = endog_f.mean() - endog_s.mean()
159
160        if swap and self.gap < 0:
161            endog_f, endog_s = endog_s, endog_f
162            exog_f, exog_s = exog_s, exog_f
163            self.gap = endog_f.mean() - endog_s.mean()
164            bi[0], bi[1] = bi[1], bi[0]
165
166        self.bi = bi
167
168        if hasconst is False:
169            exog_f = add_constant(exog_f, prepend=False)
170            exog_s = add_constant(exog_s, prepend=False)
171            self.exog = add_constant(self.exog, prepend=False)
172            self.neumark = add_constant(self.neumark, prepend=False)
173
174        self.exog_f_mean = np.mean(exog_f, axis=0)
175        self.exog_s_mean = np.mean(exog_s, axis=0)
176
177        self._f_model = OLS(endog_f, exog_f).fit(
178            cov_type=cov_type, cov_kwds=cov_kwds
179        )
180        self._s_model = OLS(endog_s, exog_s).fit(
181            cov_type=cov_type, cov_kwds=cov_kwds
182        )
183
184    def variance(self, decomp_type, n=5000, conf=0.99):
185        """
186        A helper function to calculate the variance/std. Used to keep
187        the decomposition functions cleaner
188        """
189        if self.submitted_n is not None:
190            n = self.submitted_n
191        if self.submitted_conf is not None:
192            conf = self.submitted_conf
193        if self.submitted_weight is not None:
194            submitted_weight = [
195                self.submitted_weight,
196                1 - self.submitted_weight,
197            ]
198        bi = self.bi
199        bifurcate = self.bifurcate
200        endow_eff_list = []
201        coef_eff_list = []
202        int_eff_list = []
203        exp_eff_list = []
204        unexp_eff_list = []
205        for _ in range(0, n):
206            endog = np.column_stack((self.bi_col, self.endog))
207            exog = self.exog
208            amount = len(endog)
209
210            samples = np.random.randint(0, high=amount, size=amount)
211            endog = endog[samples]
212            exog = exog[samples]
213            neumark = np.delete(exog, bifurcate, axis=1)
214
215            exog_f = exog[np.where(exog[:, bifurcate] == bi[0])]
216            exog_s = exog[np.where(exog[:, bifurcate] == bi[1])]
217            endog_f = endog[np.where(endog[:, 0] == bi[0])]
218            endog_s = endog[np.where(endog[:, 0] == bi[1])]
219            exog_f = np.delete(exog_f, bifurcate, axis=1)
220            exog_s = np.delete(exog_s, bifurcate, axis=1)
221            endog_f = endog_f[:, 1]
222            endog_s = endog_s[:, 1]
223            endog = endog[:, 1]
224
225            two_fold_type = self.two_fold_type
226
227            if self.hasconst is False:
228                exog_f = add_constant(exog_f, prepend=False)
229                exog_s = add_constant(exog_s, prepend=False)
230                exog = add_constant(exog, prepend=False)
231                neumark = add_constant(neumark, prepend=False)
232
233            _f_model = OLS(endog_f, exog_f).fit(
234                cov_type=self.cov_type, cov_kwds=self.cov_kwds
235            )
236            _s_model = OLS(endog_s, exog_s).fit(
237                cov_type=self.cov_type, cov_kwds=self.cov_kwds
238            )
239            exog_f_mean = np.mean(exog_f, axis=0)
240            exog_s_mean = np.mean(exog_s, axis=0)
241
242            if decomp_type == 3:
243                endow_eff = (exog_f_mean - exog_s_mean) @ _s_model.params
244                coef_eff = exog_s_mean @ (_f_model.params - _s_model.params)
245                int_eff = (exog_f_mean - exog_s_mean) @ (
246                    _f_model.params - _s_model.params
247                )
248
249                endow_eff_list.append(endow_eff)
250                coef_eff_list.append(coef_eff)
251                int_eff_list.append(int_eff)
252
253            elif decomp_type == 2:
254                len_f = len(exog_f)
255                len_s = len(exog_s)
256
257                if two_fold_type == "cotton":
258                    t_params = (len_f / (len_f + len_s) * _f_model.params) + (
259                        len_s / (len_f + len_s) * _s_model.params
260                    )
261
262                elif two_fold_type == "reimers":
263                    t_params = 0.5 * (_f_model.params + _s_model.params)
264
265                elif two_fold_type == "self_submitted":
266                    t_params = (
267                        submitted_weight[0] * _f_model.params
268                        + submitted_weight[1] * _s_model.params
269                    )
270
271                elif two_fold_type == "nuemark":
272                    _t_model = OLS(endog, neumark).fit(
273                        cov_type=self.cov_type, cov_kwds=self.cov_kwds
274                    )
275                    t_params = _t_model.params
276
277                else:
278                    _t_model = OLS(endog, exog).fit(
279                        cov_type=self.cov_type, cov_kwds=self.cov_kwds
280                    )
281                    t_params = np.delete(_t_model.params, bifurcate)
282
283                unexplained = (exog_f_mean @ (_f_model.params - t_params)) + (
284                    exog_s_mean @ (t_params - _s_model.params)
285                )
286
287                explained = (exog_f_mean - exog_s_mean) @ t_params
288
289                unexp_eff_list.append(unexplained)
290                exp_eff_list.append(explained)
291
292        high, low = int(n * conf), int(n * (1 - conf))
293        if decomp_type == 3:
294            return [
295                np.std(np.sort(endow_eff_list)[low:high]),
296                np.std(np.sort(coef_eff_list)[low:high]),
297                np.std(np.sort(int_eff_list)[low:high]),
298            ]
299        elif decomp_type == 2:
300            return [
301                np.std(np.sort(unexp_eff_list)[low:high]),
302                np.std(np.sort(exp_eff_list)[low:high]),
303            ]
304
305    def three_fold(self, std=False, n=None, conf=None):
306        """
307        Calculates the three-fold Oaxaca Blinder Decompositions
308
309        Parameters
310        ----------
311        std: boolean, optional
312            If true, bootstrapped standard errors will be calculated.
313        n: int, optional
314            A amount of iterations to calculate the bootstrapped
315            standard errors. This defaults to 5000.
316        conf: float, optional
317            This is the confidence required for the standard error
318            calculation. Defaults to .99, but could be anything less
319            than or equal to one. One is heavy discouraged, due to the
320            extreme outliers inflating the variance.
321
322        Returns
323        -------
324        OaxacaResults
325            A results container for the three-fold decomposition.
326        """
327        self.submitted_n = n
328        self.submitted_conf = conf
329        self.submitted_weight = None
330        std_val = None
331        self.endow_eff = (
332            self.exog_f_mean - self.exog_s_mean
333        ) @ self._s_model.params
334        self.coef_eff = self.exog_s_mean @ (
335            self._f_model.params - self._s_model.params
336        )
337        self.int_eff = (self.exog_f_mean - self.exog_s_mean) @ (
338            self._f_model.params - self._s_model.params
339        )
340
341        if std is True:
342            std_val = self.variance(3)
343
344        return OaxacaResults(
345            (self.endow_eff, self.coef_eff, self.int_eff, self.gap),
346            3,
347            std_val=std_val,
348        )
349
350    def two_fold(
351        self,
352        std=False,
353        two_fold_type="pooled",
354        submitted_weight=None,
355        n=None,
356        conf=None,
357    ):
358        """
359        Calculates the two-fold or pooled Oaxaca Blinder Decompositions
360
361        Methods
362        -------
363        std: boolean, optional
364            If true, bootstrapped standard errors will be calculated.
365
366        two_fold_type: string, optional
367            This method allows for the specific calculation of the
368            non-discriminatory model. There are four different types
369            available at this time. pooled, cotton, reimers, self_submitted.
370            Pooled is assumed and if a non-viable parameter is given,
371            pooled will be ran.
372
373            pooled - This type assumes that the pooled model's parameters
374            (a normal regression) is the non-discriminatory model.
375            This includes the indicator variable. This is generally
376            the best idea. If you have economic justification for
377            using others, then use others.
378
379            nuemark - This is similar to the pooled type, but the regression
380            is not done including the indicator variable.
381
382            cotton - This type uses the adjusted in Cotton (1988), which
383            accounts for the undervaluation of one group causing the
384            overevalution of another. It uses the sample size weights for
385            a linear combination of the two model parameters
386
387            reimers - This type uses a linear combination of the two
388            models with both parameters being 50% of the
389            non-discriminatory model.
390
391            self_submitted - This allows the user to submit their
392            own weights. Please be sure to put the weight of the larger mean
393            group only. This should be submitted in the
394            submitted_weights variable.
395
396        submitted_weight: int/float, required only for self_submitted,
397            This is the submitted weight for the larger mean. If the
398            weight for the larger mean is p, then the weight for the
399            other mean is 1-p. Only submit the first value.
400
401        n: int, optional
402            A amount of iterations to calculate the bootstrapped
403            standard errors. This defaults to 5000.
404        conf: float, optional
405            This is the confidence required for the standard error
406            calculation. Defaults to .99, but could be anything less
407            than or equal to one. One is heavy discouraged, due to the
408            extreme outliers inflating the variance.
409
410        Returns
411        -------
412        OaxacaResults
413            A results container for the two-fold decomposition.
414        """
415        self.submitted_n = n
416        self.submitted_conf = conf
417        std_val = None
418        self.two_fold_type = two_fold_type
419        self.submitted_weight = submitted_weight
420
421        if two_fold_type == "cotton":
422            self.t_params = (
423                self.len_f / (self.len_f + self.len_s) * self._f_model.params
424            ) + (self.len_s / (self.len_f + self.len_s) * self._s_model.params)
425
426        elif two_fold_type == "reimers":
427            self.t_params = 0.5 * (self._f_model.params + self._s_model.params)
428
429        elif two_fold_type == "self_submitted":
430            if submitted_weight is None:
431                raise ValueError("Please submit weights")
432            submitted_weight = [submitted_weight, 1 - submitted_weight]
433            self.t_params = (
434                submitted_weight[0] * self._f_model.params
435                + submitted_weight[1] * self._s_model.params
436            )
437
438        elif two_fold_type == "nuemark":
439            self._t_model = OLS(self.endog, self.neumark).fit(
440                cov_type=self.cov_type, cov_kwds=self.cov_kwds
441            )
442            self.t_params = self._t_model.params
443
444        else:
445            self._t_model = OLS(self.endog, self.exog).fit(
446                cov_type=self.cov_type, cov_kwds=self.cov_kwds
447            )
448            self.t_params = np.delete(self._t_model.params, self.bifurcate)
449
450        self.unexplained = (
451            self.exog_f_mean @ (self._f_model.params - self.t_params)
452        ) + (self.exog_s_mean @ (self.t_params - self._s_model.params))
453        self.explained = (self.exog_f_mean - self.exog_s_mean) @ self.t_params
454
455        if std is True:
456            std_val = self.variance(2)
457
458        return OaxacaResults(
459            (self.unexplained, self.explained, self.gap), 2, std_val=std_val
460        )
461
462
463class OaxacaResults:
464    """
465    This class summarizes the fit of the OaxacaBlinder model.
466
467    Use .summary() to get a table of the fitted values or
468    use .params to receive a list of the values
469    use .std to receive a list of the standard errors
470
471    If a two-fold model was fitted, this will return
472    unexplained effect, explained effect, and the
473    mean gap. The list will always be of the following order
474    and type. If standard error was asked for, then standard error
475    calculations will also be included for each variable after each
476    calculated effect.
477
478    unexplained : float
479        This is the effect that cannot be explained by the data at hand.
480        This does not mean it cannot be explained with more.
481    explained : float
482        This is the effect that can be explained using the data.
483    gap : float
484        This is the gap in the mean differences of the two groups.
485
486    If a three-fold model was fitted, this will
487    return characteristic effect, coefficient effect
488    interaction effect, and the mean gap. The list will
489    be of the following order and type. If standard error was asked
490    for, then standard error calculations will also be included for
491    each variable after each calculated effect.
492
493    endowment effect : float
494        This is the effect due to the group differences in
495        predictors
496    coefficient effect : float
497        This is the effect due to differences of the coefficients
498        of the two groups
499    interaction effect : float
500        This is the effect due to differences in both effects
501        existing at the same time between the two groups.
502    gap : float
503        This is the gap in the mean differences of the two groups.
504
505    Attributes
506    ----------
507    params
508        A list of all values for the fitted models.
509    std
510        A list of standard error calculations.
511    """
512
513    def __init__(self, results, model_type, std_val=None):
514        self.params = results
515        self.std = std_val
516        self.model_type = model_type
517
518    def summary(self):
519        """
520        Print a summary table with the Oaxaca-Blinder effects
521        """
522        if self.model_type == 2:
523            if self.std is None:
524                print(
525                    dedent(
526                        f"""\
527                Oaxaca-Blinder Two-fold Effects
528                Unexplained Effect: {self.params[0]:.5f}
529                Explained Effect: {self.params[1]:.5f}
530                Gap: {self.params[2]:.5f}"""
531                    )
532                )
533            else:
534                print(
535                    dedent(
536                        """\
537                Oaxaca-Blinder Two-fold Effects
538                Unexplained Effect: {:.5f}
539                Unexplained Standard Error: {:.5f}
540                Explained Effect: {:.5f}
541                Explained Standard Error: {:.5f}
542                Gap: {:.5f}""".format(
543                            self.params[0],
544                            self.std[0],
545                            self.params[1],
546                            self.std[1],
547                            self.params[2],
548                        )
549                    )
550                )
551        if self.model_type == 3:
552            if self.std is None:
553                print(
554                    dedent(
555                        f"""\
556                Oaxaca-Blinder Three-fold Effects
557                Endowment Effect: {self.params[0]:.5f}
558                Coefficient Effect: {self.params[1]:.5f}
559                Interaction Effect: {self.params[2]:.5f}
560                Gap: {self.params[3]:.5f}"""
561                    )
562                )
563            else:
564                print(
565                    dedent(
566                        f"""\
567                Oaxaca-Blinder Three-fold Effects
568                Endowment Effect: {self.params[0]:.5f}
569                Endowment Standard Error: {self.std[0]:.5f}
570                Coefficient Effect: {self.params[1]:.5f}
571                Coefficient Standard Error: {self.std[1]:.5f}
572                Interaction Effect: {self.params[2]:.5f}
573                Interaction Standard Error: {self.std[2]:.5f}
574                Gap: {self.params[3]:.5f}"""
575                    )
576                )
577