"""
Utilities used within the ``urbansim.models`` package.

"""
import collections
import logging
import numbers
try:
    from collections.abc import Mapping
except ImportError:  # Python 2 has no ``collections.abc``
    from collections import Mapping
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from tokenize import generate_tokens, NAME

import numpy as np
import pandas as pd
import patsy
import toolz as tz

from ..utils.logutil import log_start_finish

logger = logging.getLogger(__name__)


def apply_filter_query(df, filters=None):
    """
    Use the DataFrame.query method to filter a table down to the
    desired rows.

    Parameters
    ----------
    df : pandas.DataFrame
    filters : list of str or str, optional
        List of filters to apply. Will be joined together with
        ' and ' and passed to DataFrame.query. A string will be passed
        straight to DataFrame.query.
        If not supplied no filtering will be done.

    Returns
    -------
    filtered_df : pandas.DataFrame
        The result of ``df.query`` when filters were given,
        otherwise `df` itself.

    """
    with log_start_finish('apply filter query: {!r}'.format(filters), logger):
        # Nothing to filter on (None, '' or an empty list).
        if not filters:
            return df

        if isinstance(filters, str):
            query = filters
        else:
            query = ' and '.join(filters)
        return df.query(query)


def _filterize(name, value):
    """
    Turn a `name` and `value` into a string expression compatible
    with the ``DataFrame.query`` method.

    Parameters
    ----------
    name : str
        Should be the name of a column in the table to which the
        filter will be applied.

        A suffix of '_max' will result in a "less than" filter,
        a suffix of '_min' will result in a "greater than or equal to" filter,
        and no recognized suffix will result in an "equal to" filter.
    value : any
        Value side of filter for comparison to column values.
        It is rendered into the expression with ``repr``.

    Returns
    -------
    filter_exp : str

    """
    if name.endswith('_min'):
        name = name[:-4]
        comp = '>='
    elif name.endswith('_max'):
        name = name[:-4]
        comp = '<'
    else:
        comp = '=='

    # NumPy >= 2.0 reprs its scalars as e.g. ``np.float64(1.5)``, which is
    # not a literal that DataFrame.query can evaluate. Converting numeric,
    # boolean and string NumPy scalars with ``.item()`` before formatting
    # restores the bare-literal form (``1.5``) older NumPy produced.
    literal = value
    if isinstance(value, (np.number, np.bool_, np.str_)):
        literal = value.item()

    result = '{} {} {!r}'.format(name, comp, literal)
    logger.debug(
        'converted name={} and value={} to filter {}'.format(
            name, value, result))
    return result


def filter_table(table, filter_series, ignore=None):
    """
    Filter a table based on a set of restrictions given in
    Series of column name / filter parameter pairs. The column
    names can have suffixes `_min` and `_max` to indicate
    "greater than or equal to" and "less than" constraints.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to filter.
    filter_series : pandas.Series
        Series of column name / value pairs of filter constraints.
        Columns that end with '_max' will be used to create
        "less than" filters, columns that end with '_min' will be
        used to create "greater than or equal to" filters.
        A column with no suffix will be used to make an 'equal to' filter.
        Numeric values that are NaN are skipped.
    ignore : sequence of str, optional
        List of column names that should not be used for filtering.

    Returns
    -------
    filtered : pandas.DataFrame

    """
    with log_start_finish('filter table', logger):
        ignore = ignore if ignore else set()

        # ``Series.items`` replaces ``Series.iteritems``, which was
        # removed in pandas 2.0.
        filters = [_filterize(name, val)
                   for name, val in filter_series.items()
                   if not (name in ignore or
                           (isinstance(val, numbers.Number) and
                            np.isnan(val)))]

        return apply_filter_query(table, filters)


def concat_indexes(indexes):
    """
    Concatenate a sequence of pandas Indexes.

    Parameters
    ----------
    indexes : sequence of pandas.Index

    Returns
    -------
    pandas.Index
        A new Index built from ``numpy.concatenate`` of the inputs.

    """
    return pd.Index(np.concatenate(indexes))


def has_constant_expr(expr):
    """
    Report whether a model expression has a constant-specific term.
    That is, a term explicitly specifying whether the model should or
    should not include a constant. (e.g. '+ 1' or '- 1'.)

    Parameters
    ----------
    expr : str
        Model expression to check.

    Returns
    -------
    has_constant : bool

    """
    def has_constant(node):
        # Depth-first search of the parse tree for a node of type 'ONE'.
        if node.type == 'ONE':
            return True
        return any(has_constant(child) for child in node.args)

    return has_constant(patsy.parse_formula.parse_formula(expr))


def str_model_expression(expr, add_constant=True):
    """
    We support specifying model expressions as strings, lists, or dicts;
    but for use with patsy and statsmodels we need a string.
    This function will take any of those as input and return a string.

    Parameters
    ----------
    expr : str, iterable, or dict
        A string will be returned unmodified except to add or remove
        a constant.
        An iterable sequence will be joined together with ' + '.
        A dictionary should have ``right_side`` and, optionally,
        ``left_side`` keys. The ``right_side`` can be a list or a string
        and will be handled as above. If ``left_side`` is present it will
        be joined with ``right_side`` with ' ~ '.
    add_constant : bool, optional
        Whether to add a ' + 1' (if True) or ' - 1' (if False) to the model.
        If the expression already has a '+ 1' or '- 1' this option will be
        ignored.

    Returns
    -------
    model_expression : str
        A string model expression suitable for use with statsmodels and patsy.

    """
    if not isinstance(expr, str):
        # ``Mapping`` comes from ``collections.abc``; the
        # ``collections.Mapping`` alias was removed in Python 3.10.
        if isinstance(expr, Mapping):
            left_side = expr.get('left_side')
            right_side = str_model_expression(expr['right_side'], add_constant)
        else:
            # some kind of iterable like a list
            left_side = None
            right_side = ' + '.join(expr)

        if left_side:
            model_expression = ' ~ '.join((left_side, right_side))
        else:
            model_expression = right_side

    else:
        model_expression = expr

    if not has_constant_expr(model_expression):
        if add_constant:
            model_expression += ' + 1'
        else:
            model_expression += ' - 1'

    logger.debug(
        'converted expression: {!r} to model: {!r}'.format(
            expr, model_expression))
    return model_expression


def sorted_groupby(df, groupby):
    """
    Perform a groupby on a DataFrame using a specific column
    and assuming that that column is sorted.

    Parameters
    ----------
    df : pandas.DataFrame
    groupby : object
        Column name on which to groupby. This column must be sorted.

    Returns
    -------
    generator
        Yields pairs of group_name, DataFrame.
        Nothing is yielded for an empty table.

    """
    keys = df[groupby]

    # An empty table has no groups; without this guard ``.iloc[0]``
    # below would raise IndexError.
    if len(keys) == 0:
        return

    start = 0
    prev = keys.iloc[start]
    for i, x in enumerate(keys):
        if x != prev:
            yield prev, df.iloc[start:i]
            prev = x
            start = i
    # need to send back the last group
    yield prev, df.iloc[start:]


def columns_in_filters(filters):
    """
    Returns a list of the columns used in a set of query filters.

    Parameters
    ----------
    filters : list of str or str
        List of the filters as passed to ``apply_filter_query``.

    Returns
    -------
    columns : list of str
        List of all the names mentioned in the filters, without
        duplicates and excluding 'and', 'or', 'in' and 'not'.

    """
    if not filters:
        return []

    if not isinstance(filters, str):
        filters = ' '.join(filters)

    columns = []
    reserved = {'and', 'or', 'in', 'not'}

    for toknum, tokval, _, _, _ in generate_tokens(StringIO(filters).readline):
        if toknum == NAME and tokval not in reserved:
            columns.append(tokval)

    return list(tz.unique(columns))


def _tokens_from_patsy(node):
    """
    Yields all the individual tokens from within a patsy formula
    as parsed by patsy.parse_formula.parse_formula.

    Tokens of a node's arguments are yielded before the node's own token.

    Parameters
    ----------
    node : patsy.parse_formula.ParseNode

    """
    for n in node.args:
        for t in _tokens_from_patsy(n):
            yield t

    if node.token:
        yield node.token


def columns_in_formula(formula):
    """
    Returns the names of all the columns used in a patsy formula.

    Parameters
    ----------
    formula : str, iterable, or dict
        Any formula construction supported by ``str_model_expression``.

    Returns
    -------
    columns : list of str
        Names found in the formula, without duplicates.
        Empty if `formula` is None.

    """
    if formula is None:
        return []

    formula = str_model_expression(formula, add_constant=False)
    columns = []

    # Only tokens that carry an ``extra`` payload are inspected.
    parsed = patsy.parse_formula.parse_formula(formula)
    tokens = (
        t.extra for t in _tokens_from_patsy(parsed) if t.extra is not None)

    for tok in tokens:
        # if there are parentheses in the expression we
        # want to drop them and everything outside
        # and start again from the top
        if '(' in tok:
            start = tok.find('(') + 1
            fin = tok.rfind(')')
            columns.extend(columns_in_formula(tok[start:fin]))
        else:
            for toknum, tokval, _, _, _ in generate_tokens(
                    StringIO(tok).readline):
                if toknum == NAME:
                    columns.append(tokval)

    return list(tz.unique(columns))