1"""
2Utilities used within the ``urbansim.models`` package.
3
4"""
5import collections
6import logging
7import numbers
8try:
9    from StringIO import StringIO
10except ImportError:
11    from io import StringIO
12from tokenize import generate_tokens, NAME
13
14import numpy as np
15import pandas as pd
16import patsy
17import toolz as tz
18
19from ..utils.logutil import log_start_finish
20
21logger = logging.getLogger(__name__)
22
23
def apply_filter_query(df, filters=None):
    """
    Filter the rows of a table with ``DataFrame.query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    filters : list of str or str, optional
        A single string is handed to ``DataFrame.query`` as is.
        A list of strings is first combined into one expression
        with ' and '. When empty or not supplied no filtering is done.

    Returns
    -------
    filtered_df : pandas.DataFrame
        The query result, or `df` itself when there are no filters.

    """
    with log_start_finish('apply filter query: {!r}'.format(filters), logger):
        # nothing to apply: hand the table back untouched
        if not filters:
            return df

        query = filters if isinstance(filters, str) else ' and '.join(filters)
        return df.query(query)
52
53
def _filterize(name, value):
    """
    Build a string expression compatible with the ``DataFrame.query``
    method from a `name` and a `value`.

    Parameters
    ----------
    name : str
        Should be the name of a column in the table to which the
        filter will be applied, optionally followed by a suffix.

        A '_min' suffix gives a "greater than or equal to" filter,
        a '_max' suffix gives a "less than" filter, and a name with
        neither suffix gives an "equal to" filter. A recognized suffix
        is stripped from the name used in the expression.
    value : any
        Value side of filter for comparison to column values.
        It is rendered into the expression with ``repr``.

    Returns
    -------
    filter_exp : str

    """
    # default when no known suffix is present
    comp = '=='
    for suffix, operator in (('_min', '>='), ('_max', '<')):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
            comp = operator
            break

    result = '{} {} {!r}'.format(name, comp, value)
    logger.debug(
        'converted name={} and value={} to filter {}'.format(
            name, value, result))
    return result
90
91
def filter_table(table, filter_series, ignore=None):
    """
    Filter a table based on a set of restrictions given in
    Series of column name / filter parameter pairs. The column
    names can have suffixes `_min` and `_max` to indicate
    "greater than or equal to" and "less than" constraints.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to filter.
    filter_series : pandas.Series
        Series of column name / value pairs of filter constraints.
        Names that end with '_max' will be used to create
        "less than" filters, names that end with '_min' will be
        used to create "greater than or equal to" filters.
        A name with no suffix will be used to make an 'equal to' filter.
        Entries whose value is a numeric NaN are skipped.
    ignore : sequence of str, optional
        List of column names that should not be used for filtering.

    Returns
    -------
    filtered : pandas.DataFrame

    """
    with log_start_finish('filter table', logger):
        ignore = ignore if ignore else set()

        def is_missing(val):
            # only numbers are passed to np.isnan; it rejects e.g. strings
            return isinstance(val, numbers.Number) and np.isnan(val)

        # ``Series.iteritems`` was removed in pandas 2.0;
        # ``items`` is the long-standing equivalent.
        filters = [_filterize(name, val)
                   for name, val in filter_series.items()
                   if name not in ignore and not is_missing(val)]

        return apply_filter_query(table, filters)
127
128
def concat_indexes(indexes):
    """
    Join several pandas Indexes end to end into a single Index.

    Parameters
    ----------
    indexes : sequence of pandas.Index
        Concatenated in the order given.

    Returns
    -------
    pandas.Index

    """
    combined_values = np.concatenate(indexes)
    return pd.Index(combined_values)
143
144
def has_constant_expr(expr):
    """
    Report whether a model expression has a constant-specific term.
    That is, a term explicitly specifying whether the model should or
    should not include a constant. (e.g. '+ 1' or '- 1'.)

    Parameters
    ----------
    expr : str
        Model expression to check.

    Returns
    -------
    has_constant : bool
        True if any node of the parsed formula has type 'ONE'.

    """
    def contains_one(node):
        # a hit on this node, or on anything below it
        return node.type == 'ONE' or any(
            contains_one(child) for child in node.args)

    parsed = patsy.parse_formula.parse_formula(expr)
    return contains_one(parsed)
172
173
def str_model_expression(expr, add_constant=True):
    """
    We support specifying model expressions as strings, lists, or dicts;
    but for use with patsy and statsmodels we need a string.
    This function will take any of those as input and return a string.

    Parameters
    ----------
    expr : str, iterable, or dict
        A string will be returned unmodified except to add or remove
        a constant.
        An iterable sequence will be joined together with ' + '.
        A dictionary should have ``right_side`` and, optionally,
        ``left_side`` keys. The ``right_side`` can be a list or a string
        and will be handled as above. If ``left_side`` is present it will
        be joined with ``right_side`` with ' ~ '.
    add_constant : bool, optional
        Whether to add a ' + 1' (if True) or ' - 1' (if False) to the model.
        If the expression already has a '+ 1' or '- 1' this option will be
        ignored.

    Returns
    -------
    model_expression : str
        A string model expression suitable for use with statsmodels and patsy.

    """
    # The ``collections.Mapping`` alias was removed in Python 3.10; the
    # ABC lives in ``collections.abc`` on Python 3. Fall back to the old
    # location for Python 2.
    try:
        from collections.abc import Mapping
    except ImportError:
        from collections import Mapping

    if isinstance(expr, str):
        model_expression = expr
    else:
        if isinstance(expr, Mapping):
            left_side = expr.get('left_side')
            # the right side may itself be a list or string; the constant
            # is settled here so the check below finds it already present
            right_side = str_model_expression(expr['right_side'], add_constant)
        else:
            # some kind of iterable like a list
            left_side = None
            right_side = ' + '.join(expr)

        if left_side:
            model_expression = ' ~ '.join((left_side, right_side))
        else:
            model_expression = right_side

    if not has_constant_expr(model_expression):
        model_expression += ' + 1' if add_constant else ' - 1'

    logger.debug(
        'converted expression: {!r} to model: {!r}'.format(
            expr, model_expression))
    return model_expression
228
229
def sorted_groupby(df, groupby):
    """
    Perform a groupby on a DataFrame using a specific column
    and assuming that that column is sorted.

    A new group is started every time the value in the column differs
    from the value of the row before it, so equal values that are not
    adjacent end up in separate groups.

    Parameters
    ----------
    df : pandas.DataFrame
    groupby : object
        Column name on which to groupby. This column must be sorted.

    Returns
    -------
    generator
        Yields pairs of group_name, DataFrame. Nothing is yielded
        for an empty table.

    """
    keys = df[groupby]

    # no rows means no groups; ``.iloc[0]`` below would raise IndexError
    if keys.empty:
        return

    group_start = 0
    current = keys.iloc[group_start]
    for position, key in enumerate(keys):
        if key != current:
            yield current, df.iloc[group_start:position]
            current = key
            group_start = position

    # need to send back the last group
    yield current, df.iloc[group_start:]
256
257
def columns_in_filters(filters):
    """
    Returns a list of the columns used in a set of query filters.

    Parameters
    ----------
    filters : list of str or str
        List of the filters as passed to ``apply_filter_query``.

    Returns
    -------
    columns : list of str
        Unique names mentioned in the filters, in order of first
        appearance. Query keywords ('and', 'or', 'in', 'not') and the
        literals 'True', 'False' and 'None' are not reported.

    """
    if not filters:
        return []

    if not isinstance(filters, str):
        filters = ' '.join(filters)

    # NAME tokens that are query syntax or Python literals rather than
    # column references
    reserved = {'and', 'or', 'in', 'not', 'True', 'False', 'None'}

    columns = []
    seen = set()

    for toknum, tokval, _, _, _ in generate_tokens(StringIO(filters).readline):
        if toknum != NAME or tokval in reserved or tokval in seen:
            continue
        seen.add(tokval)
        columns.append(tokval)

    return columns
287
288
def _tokens_from_patsy(node):
    """
    Yields all the individual tokens from within a patsy formula
    as parsed by patsy.parse_formula.parse_formula.

    The tokens of a node's ``args`` are yielded, in order, before the
    node's own token. Nodes whose ``token`` is falsy contribute nothing
    of their own.

    Parameters
    ----------
    node : patsy.parse_formula.ParseNode

    """
    # Explicit stack in place of recursion. Each entry is a node plus a
    # flag saying whether its children have already been scheduled.
    pending = [(node, False)]
    while pending:
        current, children_done = pending.pop()

        if children_done:
            if current.token:
                yield current.token
            continue

        # revisit this node once everything beneath it has been emitted
        pending.append((current, True))
        # pushed in reverse so the first child is popped first
        for child in reversed(list(current.args)):
            pending.append((child, False))
305
306
def columns_in_formula(formula):
    """
    Returns the names of all the columns used in a patsy formula.

    Parameters
    ----------
    formula : str, iterable, or dict
        Any formula construction supported by ``str_model_expression``.
        None gives an empty list.

    Returns
    -------
    columns : list of str
        Unique names, in order of first appearance.

    """
    if formula is None:
        return []

    formula = str_model_expression(formula, add_constant=False)
    parsed = patsy.parse_formula.parse_formula(formula)

    # keep the ``extra`` text of every token that has one
    extras = (token.extra
              for token in _tokens_from_patsy(parsed)
              if token.extra is not None)

    columns = []
    for text in extras:
        if '(' not in text:
            columns.extend(
                tokval
                for toknum, tokval, _, _, _ in generate_tokens(
                    StringIO(text).readline)
                if toknum == NAME)
            continue

        # if there are parentheses in the expression we
        # want to drop them and everything outside
        # and start again from the top
        inner_start = text.find('(') + 1
        inner_end = text.rfind(')')
        columns.extend(columns_in_formula(text[inner_start:inner_end]))

    return list(tz.unique(columns))
348