1"""Model-based methods for areal interpolation."""
2
3import numpy as np
4import statsmodels.formula.api as smf
5from statsmodels.genmod.families import Gaussian, NegativeBinomial, Poisson
6from warnings import warn
7from ..area_weighted._vectorized_raster_interpolation import (
8    _check_presence_of_crs,
9    _calculate_interpolated_population_from_correspondence_table,
10    _create_non_zero_population_by_pixels_locations,
11    _fast_append_profile_in_gdf,
12    _return_weights_from_regression,
13)
14from tobler.util import project_gdf
15
16
17def glm_pixel_adjusted(
18    source_df=None,
19    target_df=None,
20    raster=None,
21    raster_codes=None,
22    variable=None,
23    formula=None,
24    likelihood="poisson",
25    force_crs_match=True,
26    **kwargs,
27):
28    """Estimate interpolated values using raster data as input to a generalized linear model, then apply an adjustmnent factor based on pixel values.
29
30    Unlike the regular `glm` function, this version applies an experimental pixel-level adjustment
31    subsequent to fitting the model. This has the benefit of making sure local control totals are
32    respected, but can also induce unknown error. Use with caution.
33
34    Parameters
35    ----------
36    source_df : geopandas.GeoDataFrame, required
37        geodataframe containing source original data to be represented by another geometry
38    target_df : geopandas.GeoDataFrame, required
39        geodataframe containing target boundaries that will be used to represent the source data
40    raster : str, required
41        path to raster file that will be used to input data to the regression model.
42        i.e. a coefficients refer to the relationship between pixel counts and population counts.
43    raster_codes : list, required (default =[21, 22, 23, 24])
44        list of integers that represent different types of raster cells.
45        Defaults to [21, 22, 23, 24] whichare considered developed land types in the NLCD
46    variable : str, required
47        name of the variable (column) to be modeled from the `source_df`
48    formula : str, optional
49        patsy-style model formula
50    likelihood : str, {'poisson', 'gaussian'} (default = "poisson")
51        the likelihood function used in the model
52
53    Returns
54    --------
55    interpolated : geopandas.GeoDataFrame
56        a new geopandas dataframe with boundaries from `target_df` and modeled attribute data
57        from the `source_df`
58
59    """
60    if not raster_codes:
61        raster_codes = [21, 22, 23, 24]
62    if not raster:
63        raise IOError(
64            "You must provide the path to a raster that can be read with rasterio"
65        )
66
67    # build weights from raster and vector data
68    weights = _return_weights_from_regression(
69        geodataframe=source_df,
70        raster_path=raster,
71        pop_string=variable,
72        formula_string=formula,
73        codes=raster_codes,
74        force_crs_match=force_crs_match,
75        likelihood=likelihood,
76        na_value=255,
77        ReLU=False,
78    )
79
80    # match vector population to pixel counts
81    correspondence_table = _create_non_zero_population_by_pixels_locations(
82        geodataframe=source_df, raster=raster, pop_string=variable, weights=weights
83    )
84
85    # estimate the model
86    interpolated = _calculate_interpolated_population_from_correspondence_table(
87        target_df, raster, correspondence_table, variable_name=variable
88    )
89
90    return interpolated
91
92
93def glm(
94    source_df=None,
95    target_df=None,
96    raster="nlcd_2011",
97    raster_codes=None,
98    variable=None,
99    formula=None,
100    likelihood="poisson",
101    force_crs_match=True,
102    return_model=False,
103):
104    """Estimate interpolated values using raster data as input to a generalized linear model.
105
106    Parameters
107    ----------
108    source_df : geopandas.GeoDataFrame, required
109        geodataframe containing source original data to be represented by another geometry
110    target_df : geopandas.GeoDataFrame, required
111        geodataframe containing target boundaries that will be used to represent the source data
112    raster : str, required (default="nlcd_2011")
113        path to raster file that will be used to input data to the regression model.
114        i.e. a coefficients refer to the relationship between pixel counts and population counts.
115        Defaults to 2011 NLCD
116    raster_codes : list, required (default =[21, 22, 23, 24, 41, 42, 52])
117        list of integers that represent different types of raster cells. If no formula is given,
118        the model will be fit from a linear combination of the logged count of each cell type
119        listed here. Defaults to [21, 22, 23, 24, 41, 42, 52] which
120        are informative land type cells from the NLCD
121    variable : str, required
122        name of the variable (column) to be modeled from the `source_df`
123    formula : str, optional
124        patsy-style model formula that specifies the model. Raster codes should be prefixed with
125        "Type_", e.g. `"n_total_pop ~ -1 + np.log1p(Type_21) + np.log1p(Type_22)`
126    likelihood : str, {'poisson', 'gaussian', 'neg_binomial'} (default = "poisson")
127        the likelihood function used in the model
128    force_crs_match : bool
129        whether to coerce geodataframe and raster to the same CRS
130    return model : bool
131        whether to return the fitted model in addition to the interpolated geodataframe.
132        If true, this will return (geodataframe, model)
133
134    Returns
135    --------
136    interpolated : geopandas.GeoDataFrame
137        a new geopandas dataframe with boundaries from `target_df` and modeled attribute
138        data from the `source_df`. If `return_model` is true, the function will also return
139        the fitted regression model for further diagnostics
140
141
142    """
143    source_df = source_df.copy()
144    target_df = target_df.copy()
145    _check_presence_of_crs(source_df)
146    liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial}
147
148    if likelihood not in liks.keys():
149        raise ValueError(f"likelihood must one of {liks.keys()}")
150
151    if not raster_codes:
152        raster_codes = [21, 22, 23, 24, 41, 42, 52]
153    raster_codes = ["Type_" + str(i) for i in raster_codes]
154
155    if not formula:
156        formula = (
157            variable
158            + "~ -1 +"
159            + "+".join(["np.log1p(" + code + ")" for code in raster_codes])
160        )
161    if source_df.crs.is_geographic:
162        source_df["area"] = project_gdf(source_df).area
163        warn("Geograpic CRS detected. Calculating area using auto UTM reprojection")
164    else:
165        source_df["area"] = source_df.area
166
167    profiled_df = _fast_append_profile_in_gdf(
168        source_df[[source_df.geometry.name, variable, "area"]], raster, force_crs_match
169    )
170
171    results = smf.glm(formula, data=profiled_df, family=liks[likelihood]()).fit()
172
173    out = target_df[[target_df.geometry.name]]
174    temp = _fast_append_profile_in_gdf(
175        out[[out.geometry.name]], raster, force_crs_match
176    )
177    temp["area"] = temp.area
178
179    out[variable] = results.predict(temp.drop(columns=[temp.geometry.name]).fillna(0))
180
181    if return_model:
182        return out, results
183
184    return out
185