1"""Model-based methods for areal interpolation.""" 2 3import numpy as np 4import statsmodels.formula.api as smf 5from statsmodels.genmod.families import Gaussian, NegativeBinomial, Poisson 6from warnings import warn 7from ..area_weighted._vectorized_raster_interpolation import ( 8 _check_presence_of_crs, 9 _calculate_interpolated_population_from_correspondence_table, 10 _create_non_zero_population_by_pixels_locations, 11 _fast_append_profile_in_gdf, 12 _return_weights_from_regression, 13) 14from tobler.util import project_gdf 15 16 17def glm_pixel_adjusted( 18 source_df=None, 19 target_df=None, 20 raster=None, 21 raster_codes=None, 22 variable=None, 23 formula=None, 24 likelihood="poisson", 25 force_crs_match=True, 26 **kwargs, 27): 28 """Estimate interpolated values using raster data as input to a generalized linear model, then apply an adjustmnent factor based on pixel values. 29 30 Unlike the regular `glm` function, this version applies an experimental pixel-level adjustment 31 subsequent to fitting the model. This has the benefit of making sure local control totals are 32 respected, but can also induce unknown error. Use with caution. 33 34 Parameters 35 ---------- 36 source_df : geopandas.GeoDataFrame, required 37 geodataframe containing source original data to be represented by another geometry 38 target_df : geopandas.GeoDataFrame, required 39 geodataframe containing target boundaries that will be used to represent the source data 40 raster : str, required 41 path to raster file that will be used to input data to the regression model. 42 i.e. a coefficients refer to the relationship between pixel counts and population counts. 43 raster_codes : list, required (default =[21, 22, 23, 24]) 44 list of integers that represent different types of raster cells. 45 Defaults to [21, 22, 23, 24] whichare considered developed land types in the NLCD 46 variable : str, required 47 name of the variable (column) to be modeled from the `source_df` 48 formula : str, optional 49 patsy-style model formula 50 likelihood : str, {'poisson', 'gaussian'} (default = "poisson") 51 the likelihood function used in the model 52 53 Returns 54 -------- 55 interpolated : geopandas.GeoDataFrame 56 a new geopandas dataframe with boundaries from `target_df` and modeled attribute data 57 from the `source_df` 58 59 """ 60 if not raster_codes: 61 raster_codes = [21, 22, 23, 24] 62 if not raster: 63 raise IOError( 64 "You must provide the path to a raster that can be read with rasterio" 65 ) 66 67 # build weights from raster and vector data 68 weights = _return_weights_from_regression( 69 geodataframe=source_df, 70 raster_path=raster, 71 pop_string=variable, 72 formula_string=formula, 73 codes=raster_codes, 74 force_crs_match=force_crs_match, 75 likelihood=likelihood, 76 na_value=255, 77 ReLU=False, 78 ) 79 80 # match vector population to pixel counts 81 correspondence_table = _create_non_zero_population_by_pixels_locations( 82 geodataframe=source_df, raster=raster, pop_string=variable, weights=weights 83 ) 84 85 # estimate the model 86 interpolated = _calculate_interpolated_population_from_correspondence_table( 87 target_df, raster, correspondence_table, variable_name=variable 88 ) 89 90 return interpolated 91 92 93def glm( 94 source_df=None, 95 target_df=None, 96 raster="nlcd_2011", 97 raster_codes=None, 98 variable=None, 99 formula=None, 100 likelihood="poisson", 101 force_crs_match=True, 102 return_model=False, 103): 104 """Estimate interpolated values using raster data as input to a generalized linear model. 105 106 Parameters 107 ---------- 108 source_df : geopandas.GeoDataFrame, required 109 geodataframe containing source original data to be represented by another geometry 110 target_df : geopandas.GeoDataFrame, required 111 geodataframe containing target boundaries that will be used to represent the source data 112 raster : str, required (default="nlcd_2011") 113 path to raster file that will be used to input data to the regression model. 114 i.e. a coefficients refer to the relationship between pixel counts and population counts. 115 Defaults to 2011 NLCD 116 raster_codes : list, required (default =[21, 22, 23, 24, 41, 42, 52]) 117 list of integers that represent different types of raster cells. If no formula is given, 118 the model will be fit from a linear combination of the logged count of each cell type 119 listed here. Defaults to [21, 22, 23, 24, 41, 42, 52] which 120 are informative land type cells from the NLCD 121 variable : str, required 122 name of the variable (column) to be modeled from the `source_df` 123 formula : str, optional 124 patsy-style model formula that specifies the model. Raster codes should be prefixed with 125 "Type_", e.g. `"n_total_pop ~ -1 + np.log1p(Type_21) + np.log1p(Type_22)` 126 likelihood : str, {'poisson', 'gaussian', 'neg_binomial'} (default = "poisson") 127 the likelihood function used in the model 128 force_crs_match : bool 129 whether to coerce geodataframe and raster to the same CRS 130 return model : bool 131 whether to return the fitted model in addition to the interpolated geodataframe. 132 If true, this will return (geodataframe, model) 133 134 Returns 135 -------- 136 interpolated : geopandas.GeoDataFrame 137 a new geopandas dataframe with boundaries from `target_df` and modeled attribute 138 data from the `source_df`. If `return_model` is true, the function will also return 139 the fitted regression model for further diagnostics 140 141 142 """ 143 source_df = source_df.copy() 144 target_df = target_df.copy() 145 _check_presence_of_crs(source_df) 146 liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial} 147 148 if likelihood not in liks.keys(): 149 raise ValueError(f"likelihood must one of {liks.keys()}") 150 151 if not raster_codes: 152 raster_codes = [21, 22, 23, 24, 41, 42, 52] 153 raster_codes = ["Type_" + str(i) for i in raster_codes] 154 155 if not formula: 156 formula = ( 157 variable 158 + "~ -1 +" 159 + "+".join(["np.log1p(" + code + ")" for code in raster_codes]) 160 ) 161 if source_df.crs.is_geographic: 162 source_df["area"] = project_gdf(source_df).area 163 warn("Geograpic CRS detected. Calculating area using auto UTM reprojection") 164 else: 165 source_df["area"] = source_df.area 166 167 profiled_df = _fast_append_profile_in_gdf( 168 source_df[[source_df.geometry.name, variable, "area"]], raster, force_crs_match 169 ) 170 171 results = smf.glm(formula, data=profiled_df, family=liks[likelihood]()).fit() 172 173 out = target_df[[target_df.geometry.name]] 174 temp = _fast_append_profile_in_gdf( 175 out[[out.geometry.name]], raster, force_crs_match 176 ) 177 temp["area"] = temp.area 178 179 out[variable] = results.predict(temp.drop(columns=[temp.geometry.name]).fillna(0)) 180 181 if return_model: 182 return out, results 183 184 return out 185