1import re
2import warnings
3from collections.abc import Iterable
4
5from datetime import datetime, timedelta, timezone
6from numbers import Number, Real, Integral
7from math import isnan, floor
8from pickle import PickleError
9
10import numpy as np
11import scipy.sparse as sp
12
13from Orange.data import _variable
14from Orange.util import Registry, Reprable, OrangeDeprecationWarning
15
16
__all__ = ["Unknown", "MISSING_VALUES", "make_variable", "is_discrete_values",
           "Value", "Variable", "ContinuousVariable", "DiscreteVariable",
           "StringVariable", "TimeVariable"]


# For storing unknowns: an unknown value is represented by NaN
Unknown = ValueUnknown = float("nan")
# For checking for unknowns: the objects and symbols that are treated as
# missing values (used as the default `Variable.unknown_str`)
MISSING_VALUES = {np.nan, "?", "nan", ".", "", "NA", "~", None}

# The largest number of distinct values for which `is_discrete_values` still
# considers numeric data as discrete
DISCRETE_MAX_VALUES = 3  # == 2 + nan
# `ContinuousVariable` uses a fixed-decimals format only if the explicitly
# set number of decimals does not exceed this limit; otherwise it uses "%g"
MAX_NUM_OF_DECIMALS = 5
# `is_discrete_values` never reports data with more than this many distinct
# values as discrete.
# NOTE(review): the original comment said such a variable "should not be
# StringVariable"; the visible code only uses the limit to reject *discrete*
# - confirm the intended wording.
DISCRETE_MAX_ALLOWED_VALUES = 100
31
32
def make_variable(cls, compute_value, *args):
    """
    Construct an instance of `cls`; used as the callable in `__reduce__`.

    :param cls: the class (or any callable) that constructs the variable
    :param compute_value: passed to `cls` as a keyword argument unless it
        is `None`
    :param args: positional arguments for `cls`
    :return: the result of calling `cls`
    """
    if compute_value is None:
        # Compatibility with old pickles: if the third positional argument
        # is a bool, it is dropped. An argument at that position that is
        # still in use (e.g. `compute_value`) can't be a bool, so this
        # should be safe.
        has_legacy_flag = len(args) > 2 and isinstance(args[2], bool)
        positional = args[:2] + args[3:] if has_legacy_flag else args
        return cls(*positional)
    return cls(*args, compute_value=compute_value)
42
43
def is_discrete_values(values):
    """
    Tell whether `values` look like values of a discrete variable.

    Returns the set of distinct non-missing values if `values` are
    discrete, `False` if they are not, and `None` if this can't be
    determined (no values, or nothing but missing values).

    Note
    ----
    Assumes consistent type of items of `values`.
    """
    n_values = len(values)
    if n_values == 0:
        return None

    # The data is numeric if the first item is a number or if the first few
    # (up to three) non-missing items can be converted to floats
    try:
        if not isinstance(next(iter(values)), Number):
            for _, item in zip(range(min(3, n_values)), values):
                if item not in MISSING_VALUES:
                    float(item)
    except ValueError:
        is_numeric = False
        max_values = int(round(n_values ** .7))
    else:
        is_numeric = True
        max_values = DISCRETE_MAX_VALUES
    limit = min(max_values, DISCRETE_MAX_ALLOWED_VALUES)

    # Too many distinct values => not discrete
    seen = set()
    for item in values:
        seen.add(item)
        if len(seen) > limit:
            return False

    # Discard missing values: the known symbols and any numeric NaN
    known = set()
    for item in seen:
        if item in MISSING_VALUES:
            continue
        if isinstance(item, Number) and np.isnan(item):
            continue
        known.add(item)

    # Nothing but missing values => indeterminate
    if not known:
        return None

    # Non-numeric data with few enough distinct values is discrete
    if not is_numeric:
        return known

    try:
        as_floats = {float(item) for item in known}
    except ValueError:
        # Some of the values can't be converted to floats. Since the number
        # of distinct values is small enough, they are probably strings
        # and discrete.
        return known

    # Numbers are discrete only if they come from {0, 1} or from {1, 2}
    if as_floats <= {0, 1} or as_floats <= {1, 2}:
        return known
    return False
101
102
class Value(float):
    """
    A float that is accompanied by the descriptor of its variable.

    The class is not used to store values; it is returned in contexts in
    which the value should come with its descriptor, for instance to print
    the symbolic value of a discrete variable.

    For primitive variables (`variable.is_primitive()` is true) the value
    is the float itself and `_value` is `None`. For other variables, like
    strings, the actual value is kept in `_value`; the float is then NaN
    if the value equals `variable.Unknown` and the smallest finite float
    otherwise.

    Printing is delegated to `variable.repr_val` and `variable.str_val`.

    Equality is defined as follows:

    - an unknown (NaN) value equals another number if that one is NaN too;
      it equals a non-number if that is found in `variable.unknown_str`;

    - if compared with a string, the value is converted to a string using
      `variable.str_val` and the two strings are compared;

    - if compared with another `Value`, their `value` properties are
      compared;

    - otherwise, the comparison inherited from `float` is used.

    Values of non-discrete variables are hashable; hashing a value of a
    discrete variable raises `TypeError`.

    .. attribute:: variable (:obj:`Orange.data.Variable`)

        Descriptor; used for printing out and for comparing with strings

    .. attribute:: value

        Read-only property: the symbolic value (or `Unknown`) for discrete
        variables, the stored object for string variables and the plain
        float otherwise.
    """
    __slots__ = "variable", "_value"

    def __new__(cls, variable, value=Unknown):
        """
        Construct a new instance of Value with the given descriptor and value.

        For primitive variables, `value` is converted to a float and
        `_value` is set to `None`. Otherwise `value` is stored in `_value`
        and the float only tells whether the value is unknown (NaN) or not
        (the smallest finite float).

        :param variable: descriptor
        :type variable: Orange.data.Variable
        :param value: value
        """
        if variable.is_primitive():
            payload, stored = value, None
        else:
            is_unknown = value == variable.Unknown
            payload = np.nan if is_unknown else np.finfo(float).min
            stored = value
        self = super().__new__(cls, payload)
        self.variable = variable
        self._value = stored
        return self

    def __init__(self, _, __=Unknown):
        # the instance is set up in __new__,
        # pylint: disable=super-init-not-called
        pass

    def __repr__(self):
        descriptor = self.variable
        return "Value('%s', %s)" % (descriptor.name,
                                    descriptor.repr_val(self))

    def __str__(self):
        return self.variable.str_val(self)

    def __eq__(self, other):
        if isinstance(self, Real) and isnan(self):
            # Unknown: equal to NaN numbers and to symbols for unknowns
            if isinstance(other, Real):
                return isnan(other)
            return other in self.variable.unknown_str
        if isinstance(other, str):
            return self.variable.str_val(self) == other
        if not isinstance(other, Value):
            return super().__eq__(other)
        return self.value == other.value

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        other_is_str = isinstance(other, str)
        if not self.variable.is_primitive():
            # Non-primitive: compare the stored objects
            return self.value < (other if other_is_str else other.value)
        if other_is_str:
            # Primitive: a string is first converted by the descriptor
            other = self.variable.to_val(other)
        return super().__lt__(other)

    def __le__(self, other):
        return self.__lt__(other) or self.__eq__(other)

    def __gt__(self, other):
        return not self.__le__(other)

    def __ge__(self, other):
        return not self.__lt__(other)

    def __contains__(self, other):
        # Supported only for testing a substring of a stored string
        stored = self._value
        if isinstance(stored, str) and isinstance(other, str):
            return other in stored
        raise TypeError("invalid operation on Value()")

    def __hash__(self):
        if self.variable.is_discrete:
            # __eq__ would require the index, the Value and the symbolic
            # value to have the same hash, e.g.
            # hash(1)
            # == hash(Value(DiscreteVariable("var", ["red", "green", "blue"]), 1))
            # == hash("green")
            # which is not possible. Hash indices or symbolic values instead.
            raise TypeError("unhashable type - cannot hash values of discrete variables!")
        stored = self._value
        return super().__hash__() if stored is None else hash(stored)

    @property
    def value(self):
        descriptor = self.variable
        if descriptor.is_discrete:
            if isnan(self):
                return Unknown
            return descriptor.values[int(self)]
        if descriptor.is_string:
            return self._value
        return float(self)

    def __getnewargs__(self):
        # Arguments for __new__ when unpickling; `_value` is restored
        # separately by __setstate__
        return self.variable, float(self)

    def __getstate__(self):
        return {"value": getattr(self, '_value', None)}

    def __setstate__(self, state):
        # defined in __new__, pylint: disable=attribute-defined-outside-init
        self._value = state.get("value")
256
257
class VariableMeta(Registry):
    """
    Metaclass of :obj:`Variable` and, through inheritance, of its subclasses.

    The class adds nothing to `Orange.util.Registry`.
    NOTE(review): `Registry` is defined elsewhere; presumably it keeps
    track of the classes created with it - confirm in `Orange.util`.
    """
    pass
260
261
class _predicatedescriptor(property):
    """
    A property whose getter is returned as a plain function when the
    attribute is accessed via the class, so it can be called with an
    instance as an argument.

    >>> class A:
    ...     foo = False
    ...     @_predicatedescriptor
    ...     def is_foo(self):
    ...         return self.foo
    ...
    >>> a = A()
    >>> a.is_foo
    False
    >>> A.is_foo(a)
    False
    """
    def __get__(self, instance, objtype=None):
        if instance is not None:
            # Accessed via an instance: behave as an ordinary property
            return super().__get__(instance, objtype)
        # Accessed via the class: hand out the getter itself
        return self.fget
282
283
class Variable(Reprable, metaclass=VariableMeta):
    """
    The base class for variable descriptors; it holds the variable's name
    and the properties shared by all types of variables.

    .. attribute:: name

        The name of the variable (read-only).

    .. attribute:: unknown_str

        A set of values that represent unknowns in conversion from textual
        formats. It is initialized to `MISSING_VALUES`.

    .. attribute:: compute_value

        A function for computing the variable's value when converting from
        another domain which does not contain this variable; `None` if it
        is not given. The attribute is read-only.

    .. attribute:: sparse

        A flag about sparsity of the variable. When set, the variable suggests
        it should be stored in a sparse matrix.

    .. attribute:: source_variable

        An optional descriptor of the source variable - if any - from which
        this variable is derived and computed via :obj:`compute_value`.
        It is initialized to `None`.

    .. attribute:: attributes

        A dictionary with user-defined attributes of the variable
    """
    Unknown = ValueUnknown

    def __init__(self, name="", compute_value=None, *, sparse=False):
        """
        Construct a variable descriptor.

        A warning (`OrangeDeprecationWarning`) is issued if the name is
        empty.
        """
        if not name:
            warnings.warn("Variable must have a name", OrangeDeprecationWarning,
                          stacklevel=3)
        self._name = name
        self._compute_value = compute_value
        self.unknown_str = MISSING_VALUES
        self.source_variable = None
        self.sparse = sparse
        self.attributes = {}

    @property
    def name(self):
        return self._name

    def make_proxy(self):
        """
        Return a new instance of the same class with a copy of the
        instance's `__dict__`; `attributes` are copied into a new dict.

        :return: copy of self
        :rtype: Variable
        """
        proxy = self.__class__(self.name)
        vars(proxy).update(vars(self))
        proxy.attributes = dict(self.attributes)
        return proxy

    def __eq__(self, other):
        # Variables are equal if they are of the same type, have the same
        # name and if, after skipping any `Identity` transformations, their
        # sources have the same name and `compute_value`
        if type(self) is not type(other):
            return False

        own_source, other_source = map(self._get_identical_source,
                                       (self, other))
        # pylint: disable=protected-access
        return self.name == other.name \
            and own_source.name == other_source.name \
            and own_source._compute_value == other_source._compute_value

    def __hash__(self):
        # pylint: disable=protected-access
        source = self._get_identical_source(self)
        key = (self.name, source.name, type(self), source._compute_value)
        return hash(key)

    @staticmethod
    def _get_identical_source(var):
        # Follow the chain of `Identity` transformations to the variable
        # whose `compute_value` is not an `Identity`
        # pylint: disable=protected-access,import-outside-toplevel
        from Orange.preprocess.transformation import Identity
        while True:
            transformation = var._compute_value
            if not isinstance(transformation, Identity):
                return var
            var = transformation.variable

    @classmethod
    def make(cls, name, *args, **kwargs):
        """
        Construct and return a new variable of this class; all arguments
        are passed to the constructor.
        """
        return cls(name, *args, **kwargs)

    @classmethod
    def _clear_cache(cls):
        # Does nothing except issue a warning
        warnings.warn(
            "_clear_cache is no longer needed and thus deprecated")

    @staticmethod
    def _clear_all_caches():
        # Does nothing except issue a warning
        warnings.warn(
            "_clear_all_caches is no longer needed and thus deprecated")

    @classmethod
    def is_primitive(cls, var=None):
        """
        `True` if the variable is discrete or continuous, that is, if its
        values are stored as floats. The check is made on the type of `var`
        if it is given, and on the class itself otherwise.
        Non-primitive variables can appear in the data only as meta attributes.
        """
        if var is None:
            to_check = cls
        else:
            to_check = type(var)
        return issubclass(to_check, (DiscreteVariable, ContinuousVariable))

    @_predicatedescriptor
    def is_discrete(self):
        return isinstance(self, DiscreteVariable)

    @_predicatedescriptor
    def is_continuous(self):
        return isinstance(self, ContinuousVariable)

    @_predicatedescriptor
    def is_string(self):
        return isinstance(self, StringVariable)

    @_predicatedescriptor
    def is_time(self):
        return isinstance(self, TimeVariable)

    @staticmethod
    def repr_val(val):
        """
        Return a textual representation of variable's value `val`. Argument
        `val` must be a float (for primitive variables) or an arbitrary
        Python object (for non-primitives).

        The base class method only raises `RuntimeError`; derived classes
        must overload the function.
        """
        raise RuntimeError("variable descriptors must overload repr_val()")

    str_val = repr_val

    def to_val(self, s):
        """
        Convert the given argument to a value of the variable. The
        argument can be a string, a number or `None`.

        For non-primitive variables the method returns the argument itself.
        For primitive variables it returns :obj:`~Orange.data.Unknown` if
        `s` is found in :obj:`~Orange.data.Variable.unknown_str`, and raises
        `RuntimeError` otherwise; derived classes of primitive variables
        must overload the function.

        :param s: value, represented as a number, string or `None`
        :type s: str, float or None
        :rtype: float or object
        """
        if self.is_primitive():
            if s in self.unknown_str:
                return Unknown
            raise RuntimeError(
                "primitive variable descriptors must overload to_val()")
        return s

    def val_from_str_add(self, s):
        """
        Convert the given string to a value of the variable. The method
        is similar to :obj:`to_val` except that it only accepts strings and
        that derived classes may add new values to the variable's domain.

        The base class method just calls `to_val`.

        :param s: symbolic representation of the value
        :type s: str
        :rtype: float or object
        """
        return self.to_val(s)

    def __str__(self):
        return self.name

    @property
    def compute_value(self):
        return self._compute_value

    def __reduce__(self):
        # Variables are unpickled by calling `make_variable` with the class,
        # `compute_value` and the name; `__dict__` is passed as the state
        if not self.name:
            raise PickleError("Variables without names cannot be pickled")

        constructor_args = (self.__class__, self._compute_value, self.name)
        return make_variable, constructor_args, self.__dict__

    # A sentinel for `copy`: `None` can't serve as the default because it
    # is a valid value of `compute_value`
    _CopyComputeValue = object()

    def copy(self, compute_value=_CopyComputeValue, *, name=None, **kwargs):
        """
        Return a new variable of the same type and with the same `sparse`
        flag and a copy of `attributes`.

        :param compute_value: `compute_value` of the new variable; if
            omitted, the one from this variable is used
        :param name: the new name; if `None` or empty, the name is kept
        :param kwargs: additional arguments for the constructor
        :rtype: Variable
        """
        if compute_value is self._CopyComputeValue:
            compute_value = self.compute_value
        new_name = name or self.name
        var = type(self)(name=new_name, compute_value=compute_value,
                         sparse=self.sparse, **kwargs)
        var.attributes = dict(self.attributes)
        return var

    def renamed(self, new_name):
        """
        Return a copy with the given name whose `compute_value` is an
        `Identity` transformation of this variable.
        """
        # prevent cyclic import, pylint: disable=import-outside-toplevel
        from Orange.preprocess.transformation import Identity
        identity = Identity(variable=self)
        return self.copy(name=new_name, compute_value=identity)
499
# In the visible part of this module the helper descriptor is used only
# within the body of `Variable`; once that class is defined, the name is
# removed from the module namespace.
del _predicatedescriptor
501
502
class ContinuousVariable(Variable):
    """
    Descriptor for continuous variables.

    .. attribute:: number_of_decimals

        The number of decimals when the value is printed out (default: 3).

    .. attribute:: adjust_decimals

        A flag that is set to 2 while `number_of_decimals` has its default
        value and to 0 when `number_of_decimals` is set explicitly.

    .. attribute:: format_str

        The format used by :obj:`repr_val`.

    Unless the number of decimals is given, `number_of_decimals` is 3,
    `adjust_decimals` is 2 and values are printed using the format `"%g"`.
    If `number_of_decimals` is set manually, `adjust_decimals` is set
    to 0 and the values are printed with that number of decimals unless it
    exceeds `MAX_NUM_OF_DECIMALS`, in which case `"%g"` is used.

    NOTE(review): `adjust_decimals` is not read in this class; presumably
    `_variable.val_from_str_add_cont` uses it to increase the number of
    decimals to match the parsed strings - confirm in `_variable`.
    """

    TYPE_HEADERS = ('continuous', 'c', 'numeric', 'n')

    def __init__(self, name="", number_of_decimals=None, compute_value=None, *, sparse=False):
        """
        Construct a new continuous variable.

        If `number_of_decimals` is `None`, the number of decimals is set to
        three and `adjust_decimals` to 2.
        """
        super().__init__(name, compute_value, sparse=sparse)
        self._max_round_diff = 0
        self.number_of_decimals = number_of_decimals

    @property
    def number_of_decimals(self):
        return self._number_of_decimals

    # noinspection PyAttributeOutsideInit
    @number_of_decimals.setter
    def number_of_decimals(self, x):
        if x is None:
            # Defaults; `_max_round_diff` is left as it is
            self._number_of_decimals = 3
            self.adjust_decimals = 2
            self._format_str = "%g"
        else:
            self._number_of_decimals = x
            self._max_round_diff = 10 ** (-x - 6)
            self.adjust_decimals = 0
            fixed = self._number_of_decimals <= MAX_NUM_OF_DECIMALS
            self._format_str = \
                "%.{}f".format(self.number_of_decimals) if fixed else "%g"

    @property
    def format_str(self):
        return self._format_str

    @format_str.setter
    def format_str(self, value):
        self._format_str = value

    def to_val(self, s):
        """
        Convert a value, given as an instance of an arbitrary type, to a
        float. `Unknown` is returned if `s` is found in `unknown_str`.
        """
        return Unknown if s in self.unknown_str else float(s)

    def val_from_str_add(self, s):
        """
        Convert a value from a string; the conversion is delegated to
        `_variable.val_from_str_add_cont`.
        """
        return _variable.val_from_str_add_cont(self, s)

    def repr_val(self, val):
        """
        Return the value as a string, formatted with `format_str`.

        Values that are not finite are shown as "?". If the format
        prescribes a fixed number of decimals and rounding to that number
        would change the value by more than the allowed difference, the
        value is printed with two additional decimals.
        """
        # Table value can't be inf, but repr_val can be used to print any float
        if not np.isfinite(val):
            return "?"
        decimals = self._number_of_decimals
        if self.format_str != "%g":
            rounding_error = abs(round(val, decimals) - val)
            if rounding_error > self._max_round_diff:
                return f"{val:.{decimals + 2}f}"
        return self._format_str % val

    str_val = repr_val

    def copy(self, compute_value=Variable._CopyComputeValue,
             *, name=None, **kwargs):
        """
        Return a copy of the variable. If `number_of_decimals` is given
        (and is not `None`), it is set on the copy; otherwise the copy gets
        the same settings for decimals and format as this variable.
        """
        # pylint does not understand that `var` is `ContinuousVariable`:
        # pylint: disable=protected-access
        number_of_decimals = kwargs.pop("number_of_decimals", None)
        var = super().copy(compute_value=compute_value, name=name, **kwargs)
        if number_of_decimals is None:
            for attr in ("_number_of_decimals", "_max_round_diff",
                         "adjust_decimals"):
                setattr(var, attr, getattr(self, attr))
            var.format_str = self._format_str
        else:
            var.number_of_decimals = number_of_decimals
        return var
611
612
# An alias of `tuple`, kept for backward compatibility (for pickled table).
# NOTE(review): presumably old pickles refer to `TupleList` by name - confirm
# before removing.
TupleList = tuple
614
615
class DiscreteVariable(Variable):
    """
    Descriptor for symbolic, discrete variables. Values of discrete variables
    are stored as floats; the numbers correspond to indices in the tuple of
    values.

    .. attribute:: values

        A tuple of variable's values (strings); read-only. New values are
        added with :obj:`add_value` or :obj:`val_from_str_add`.
    """

    TYPE_HEADERS = ('discrete', 'd', 'categorical')

    presorted_values = []

    def __init__(
            self, name="", values=(), compute_value=None, *, sparse=False
    ):
        """
        Construct a discrete variable descriptor with the given values.

        :param name: the name of the variable
        :param values: an iterable of strings
        :param compute_value: see :obj:`Variable`
        :param sparse: see :obj:`Variable`
        :raises TypeError: if any of the values is not a string
        """
        values = tuple(values)  # some people (including me) pass a generator
        if not all(isinstance(value, str) for value in values):
            raise TypeError("values of DiscreteVariables must be strings")

        super().__init__(name, compute_value, sparse=sparse)
        self._values = values
        # Maps symbolic values to their indices
        self._value_index = {value: i for i, value in enumerate(values)}

    @property
    def values(self):
        return self._values

    def get_mapping_from(self, other):
        """
        Return an array of floats in which the i-th element is the index
        that the i-th value of `other` has in this variable, or NaN if this
        variable does not have that value.
        """
        return np.array(
            [self._value_index.get(value, np.nan) for value in other.values],
            dtype=float)

    def get_mapper_from(self, other):
        """
        Return a function that maps values (indices) of variable `other` to
        values of this variable.

        The function is called as `mapper(value, col_idx=None)`.

        If `col_idx` is given, `value` must be a 2d numpy array or a sparse
        csr or csc matrix; the column `col_idx` is mapped in place and the
        function returns `None`. In-place mapping of sparse matrices
        requires that 0 maps to 0.

        Otherwise the function returns mapped value(s). The argument can be
        a number, a string (a value of `other`), a numpy array that is 1d
        or 2d with a single row or column, a sparse matrix with a single
        row or column, or another iterable of numbers, which is mapped into
        an object of the same type. `ValueError` is raised for arguments of
        other types or shapes.
        """
        mapping = self.get_mapping_from(other)
        if not mapping.size:
            # Nans in data are temporarily replaced with 0, mapped and changed
            # back to nans. This would fail if mapping[0] is out of range.
            mapping = np.array([np.nan])

        def mapper(value, col_idx=None):

            # In-place mapping
            if col_idx is not None:
                if sp.issparse(value) and mapping[0] != 0:
                    raise ValueError(
                        "In-place mapping of sparse matrices must map 0 to 0")

                # CSR requires mapping of non-contiguous area
                if sp.isspmatrix_csr(value):
                    col = value.indices == col_idx
                    nans = np.isnan(value.data) * col
                    value.data[nans] = 0
                    value.data[col] = mapping[value.data[col].astype(int)]
                    value.data[nans] = np.nan
                    return None

                # Dense and CSC map a contiguous area
                if isinstance(value, np.ndarray) and value.ndim == 2:
                    col = value[:, col_idx]
                elif sp.isspmatrix_csc(value):
                    col = value.data[value.indptr[col_idx]
                                     :value.indptr[col_idx + 1]]
                else:
                    # The two parts of the message used to be joined
                    # without a space ("... array ora csc ...")
                    raise ValueError(
                        "In-place column mapping requires a 2d array or "
                        "a csc or csr matrix.")

                nans = np.isnan(col)
                col[nans] = 0
                col[:] = mapping[col.astype(int)]
                col[nans] = np.nan
                return None

            # Mapping into a copy
            if isinstance(value, (int, float)):
                return value if np.isnan(value) else mapping[int(value)]
            if isinstance(value, str):
                return mapping[other.values.index(value)]
            if isinstance(value, np.ndarray):
                # Accept a vector or a 2d array with a single row or column,
                # like in the branch for sparse matrices below. The original
                # condition (`ndim != 2 and min(shape) != 1`) rejected all
                # 2d arrays and accepted arrays with three or more
                # dimensions.
                if not (value.ndim == 1
                        or value.ndim == 2 and min(value.shape) == 1):
                    raise ValueError(
                        f"Column mapping can't map {value.ndim}-d objects")

                if value.dtype == object:
                    value = value.astype(float)  # this happens with metas
                try:
                    nans = np.isnan(value)
                except TypeError:  # suppose it's already an integer type
                    return mapping[value]
                value = value.astype(int)
                value[nans] = 0
                value = mapping[value]
                value[nans] = np.nan
                return value
            if sp.issparse(value):
                if min(value.shape) != 1:
                    raise ValueError("Column mapping can't map "
                                     f"{value.ndim}-dimensional objects")
                # If zeros do not map to zero (or nan), implicit zeros must
                # be mapped, too, so the result is dense
                if mapping[0] != 0 and not np.isnan(mapping[0]):
                    return mapper(np.array(value.todense()).flatten())
                value = value.copy()
                value.data = mapper(value.data)
                return value
            if isinstance(value, Iterable):
                return type(value)(val if np.isnan(val) else mapping[int(val)]
                                   for val in value)
            raise ValueError(
                f"invalid type for value(s): {type(value).__name__}")

        return mapper

    def to_val(self, s):
        """
        Convert the given argument to a value of the variable.

        `Unknown` is returned if the argument is `None` or one of the
        representations for unknown values. An integral number is returned
        as it is, without checking whether it is within bounds. Other real
        numbers are returned as they are if they are NaN, and are otherwise
        converted with `floor(s + 0.25)`. Otherwise, the argument must be
        a string and the method returns its index in :obj:`values`.

        :param s: values, represented as a number, string or `None`
        :rtype: float
        :raises TypeError: if `s` is neither a number nor a string
        :raises ValueError: if `s` is a string that is not among the values
        """
        if s is None:
            return ValueUnknown

        if isinstance(s, Integral):
            return s
        if isinstance(s, Real):
            return s if isnan(s) else floor(s + 0.25)
        if s in self.unknown_str:
            return ValueUnknown
        if not isinstance(s, str):
            raise TypeError('Cannot convert {} to value of "{}"'.format(
                type(s).__name__, self.name))
        if s not in self._value_index:
            raise ValueError(f"Value {s} does not exist")
        return self._value_index[s]

    def add_value(self, s):
        """
        Add a value `s` to the values of the variable; do nothing if the
        variable already has this value.

        :raises TypeError: if `s` is not a string
        """
        if not isinstance(s, str):
            raise TypeError("values of DiscreteVariables must be strings")
        if s in self._value_index:
            return
        self._value_index[s] = len(self.values)
        self._values += (s, )

    def val_from_str_add(self, s):
        """
        Similar to :obj:`to_val`, except that the argument is converted to
        a string (unless it is `None`) and that the value is added to the
        variable's values if it does not exist yet.

        :param s: symbolic representation of the value
        :type s: str
        :rtype: float
        """
        s = str(s) if s is not None else s
        if s in self.unknown_str:
            return ValueUnknown
        val = self._value_index.get(s)
        if val is None:
            self.add_value(s)
            # the new value is the last one
            val = len(self.values) - 1
        return val

    def repr_val(self, val):
        """
        Return a textual representation of the value (`self.values[int(val)]`)
        or "?" if the value is unknown.

        :param val: value
        :type val: float (should be whole number)
        :rtype: str
        """
        if isnan(val):
            return "?"
        return '{}'.format(self.values[int(val)])

    str_val = repr_val

    def __reduce__(self):
        # Values are passed to the constructor, so `_values` is removed from
        # the state that is applied after the variable is constructed
        if not self.name:
            raise PickleError("Variables without names cannot be pickled")
        state = dict(self.__dict__)
        state.pop("_values")
        return (
            make_variable,
            (self.__class__, self._compute_value, self.name, self.values),
            state
        )

    def copy(self, compute_value=Variable._CopyComputeValue,
             *, name=None, values=None, **_):
        """
        Return a copy of the variable, optionally with different names of
        values. Any other keyword arguments are ignored.

        :param values: new values; if `None`, the values are kept
        :raises ValueError: if the number of new values differs from the
            number of values of this variable
        """
        # pylint: disable=arguments-differ
        if values is not None and len(values) != len(self.values):
            raise ValueError(
                "number of values must match the number of original values")
        # Test for None instead of truthiness, which fails for arrays; empty
        # `values` can only replace empty values, so the result is the same
        return super().copy(compute_value=compute_value, name=name,
                            values=self.values if values is None else values)
822
823
class StringVariable(Variable):
    """
    Descriptor for string variables. String variables can only appear as
    meta attributes. The unknown value is represented by an empty string.
    """
    Unknown = ""
    TYPE_HEADERS = ('string', 's', 'text')

    def to_val(self, s):
        """
        Return the value as a string: an empty string for `None`, the same
        object if the argument is already a string, and `str(s)` otherwise.
        """
        if s is None:
            return ""
        return s if isinstance(s, str) else str(s)

    val_from_str_add = to_val

    @staticmethod
    def str_val(val):
        """
        Return a string representation of the value; "?" is returned for
        an empty string and for a `Value` whose `value` is empty.
        """
        if isinstance(val, Value):
            inner = val.value
            return str(inner) if inner else "?"
        if isinstance(val, str) and val == "":
            return "?"
        return str(val)

    def repr_val(self, val):
        """Return the result of `str_val`, enclosed in double quotes."""
        return '"{}"'.format(self.str_val(val))
859
860
class TimeVariable(ContinuousVariable):
    """
    TimeVariable is a continuous variable with Unix epoch
    (1970-01-01 00:00:00+0000) as the origin (0.0). Later dates are positive
    real numbers (equivalent to Unix timestamp, with microseconds in the
    fraction part), and the dates before it map to the negative real numbers.

    Unfortunately due to limitation of Python datetime, only dates
    with year >= 1 (A.D.) are supported.

    If time is specified without a date, Unix epoch is assumed.

    If time is specified without a UTC offset, `parse` interprets it as UTC.
    """
    _all_vars = {}
    TYPE_HEADERS = ('time', 't')
    # The epoch as a naive (timezone-less) datetime
    UNIX_EPOCH = datetime(1970, 1, 1)
    # Formats accepted by `parse`, as arguments to `datetime.strptime`
    _ISO_FORMATS = (
        # have_date, have_time, format_str
        # in order of decreased probability
        (1, 1, '%Y-%m-%d %H:%M:%S%z'),
        (1, 1, '%Y-%m-%d %H:%M:%S'),
        (1, 1, '%Y-%m-%d %H:%M'),
        (1, 1, '%Y-%m-%dT%H:%M:%S%z'),
        (1, 1, '%Y-%m-%dT%H:%M:%S'),

        (1, 0, '%Y-%m-%d'),

        (1, 1, '%Y-%m-%d %H:%M:%S.%f'),
        (1, 1, '%Y-%m-%dT%H:%M:%S.%f'),
        (1, 1, '%Y-%m-%d %H:%M:%S.%f%z'),
        (1, 1, '%Y-%m-%dT%H:%M:%S.%f%z'),

        (1, 1, '%Y%m%dT%H%M%S%z'),
        (1, 1, '%Y%m%d%H%M%S%z'),

        (0, 1, '%H:%M:%S.%f'),
        (0, 1, '%H:%M:%S'),
        (0, 1, '%H:%M'),

        # These parse as continuous features (plain numbers)
        (1, 1, '%Y%m%dT%H%M%S'),
        (1, 1, '%Y%m%d%H%M%S'),
        (1, 0, '%Y%m%d'),
        (1, 0, '%Y%j'),
        (1, 0, '%Y'),
        (0, 1, '%H%M%S.%f'),

        # BUG: In Python as in C, %j doesn't necessitate 0-padding,
        # so these two lines must be in this order
        (1, 0, '%Y-%m'),
        (1, 0, '%Y-%j'),
    )
    # Order in which `_ISO_FORMATS` are tried, as a list of indices into
    # `_ISO_FORMATS`. Must never change order of last 2 items. Only modified
    # via assignment in `parse`.
    __ISO_FORMATS_PROBE_SEQ = list(range(len(_ISO_FORMATS)))
    # The regex that matches all above formats
    REGEX = (r'^('
             r'\d{1,4}-\d{2}-\d{2}([ T]\d{2}:\d{2}(:\d{2}(\.\d+)?([+-]\d{4})?)?)?|'
             r'\d{1,4}\d{2}\d{2}(T?\d{2}\d{2}\d{2}([+-]\d{4})?)?|'
             r'\d{2}:\d{2}(:\d{2}(\.\d+)?)?|'
             r'\d{2}\d{2}\d{2}\.\d+|'
             r'\d{1,4}(-?\d{2,3})?'
             r')$')
925
926    class InvalidDateTimeFormatError(ValueError):
927        def __init__(self, date_string):
928            super().__init__(
929                "Invalid datetime format '{}'. "
930                "Only ISO 8601 supported.".format(date_string))
931
    # `match` method of the compiled `REGEX`; returns a match object if the
    # string looks like one of the supported formats and None otherwise
    _matches_iso_format = re.compile(REGEX).match

    # UTC offset and associated timezone. If parsed datetime values provide an
    # offset, it is used for display. If not all values have the same offset,
    # +0000 (=UTC) timezone is used and utc_offset is set to False.
    # These are class-level defaults; `parse` assigns to them on the instance.
    utc_offset = None
    # The right-hand side is the `timezone` class imported from `datetime`
    timezone = timezone.utc
939
940    def __init__(self, *args, have_date=0, have_time=0, **kwargs):
941        super().__init__(*args, **kwargs)
942        self.have_date = have_date
943        self.have_time = have_time
944
945    def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_):
946        return super().copy(compute_value=compute_value, name=name,
947                            have_date=self.have_date, have_time=self.have_time)
948
949    @staticmethod
950    def _tzre_sub(s, _subtz=re.compile(r'([+-])(\d\d):(\d\d)$').sub):
951        # Replace +ZZ:ZZ with ISO-compatible +ZZZZ, or strip +0000
952        return s[:-6] if s.endswith(('+00:00', '-00:00')) else _subtz(r'\1\2\3', s)
953
954    def repr_val(self, val):
955        if isnan(val):
956            return '?'
957        if not self.have_date and not self.have_time:
958            # The time is relative, unitless. The value is absolute.
959            return str(val.value) if isinstance(val, Value) else str(val)
960
961        # If you know how to simplify this, be my guest
962        seconds = int(val)
963        microseconds = int(round((val - seconds) * 1e6))
964        if val < 0:
965            if microseconds:
966                seconds, microseconds = seconds - 1, int(1e6) + microseconds
967            date = datetime.fromtimestamp(0, tz=self.timezone) + timedelta(seconds=seconds)
968        else:
969            date = datetime.fromtimestamp(seconds, tz=self.timezone)
970        date = str(date.replace(microsecond=microseconds))
971
972        if self.have_date and not self.have_time:
973            date = date.split()[0]
974        elif not self.have_date and self.have_time:
975            date = date.split()[1]
976        date = self._tzre_sub(date)
977        return date
978
    # `str_val` is an alias: values are converted to strings in the same way
    # as they are represented by `repr_val`
    str_val = repr_val
980
981    def parse(self, datestr):
982        """
983        Return `datestr`, a datetime provided in one of ISO 8601 formats,
984        parsed as a real number. Value 0 marks the Unix epoch, positive values
985        are the dates after it, negative before.
986
987        If date is unspecified, epoch date is assumed.
988
989        If time is unspecified, 00:00:00.0 is assumed.
990
991        If timezone is unspecified, local time is assumed.
992        """
993        if datestr in MISSING_VALUES:
994            return Unknown
995        datestr = datestr.strip().rstrip('Z')
996
997        if not self._matches_iso_format(datestr):
998            try:
999                # If it is a number, assume it is a unix timestamp
1000                value = float(datestr)
1001                self.have_date = self.have_time = 1
1002                return value
1003            except ValueError:
1004                raise self.InvalidDateTimeFormatError(datestr)
1005
1006        try_order = self.__ISO_FORMATS_PROBE_SEQ
1007        for i, (have_date, have_time, fmt) in enumerate(
1008                map(self._ISO_FORMATS.__getitem__, try_order)):
1009            try:
1010                dt = datetime.strptime(datestr, fmt)
1011            except ValueError:
1012                continue
1013            else:
1014                # Pop this most-recently-used format index to front,
1015                # excluding last 2
1016                if 0 < i < len(try_order) - 2:
1017                    try_order = try_order.copy()
1018                    try_order[i], try_order[0] = try_order[0], try_order[i]
1019                    TimeVariable.__ISO_FORMATS_PROBE_SEQ = try_order
1020                self.have_date |= have_date
1021                self.have_time |= have_time
1022                if not have_date:
1023                    dt = dt.replace(self.UNIX_EPOCH.year,
1024                                    self.UNIX_EPOCH.month,
1025                                    self.UNIX_EPOCH.day)
1026                break
1027        else:
1028            raise self.InvalidDateTimeFormatError(datestr)
1029
1030        # Remember UTC offset. If not all parsed values share the same offset,
1031        # remember none of it.
1032        offset = dt.utcoffset()
1033        if self.utc_offset is not False:
1034            if offset and self.utc_offset is None:
1035                self.utc_offset = offset
1036                self.timezone = timezone(offset)
1037            elif self.utc_offset != offset:
1038                self.utc_offset = False
1039                self.timezone = timezone.utc
1040
1041        # Convert time to UTC timezone. In dates without timezone,
1042        # localtime is assumed. See also:
1043        # https://docs.python.org/3.4/library/datetime.html#datetime.datetime.timestamp
1044        if dt.tzinfo:
1045            dt -= dt.utcoffset()
1046        dt = dt.replace(tzinfo=timezone.utc)
1047
1048        # Unix epoch is the origin, older dates are negative
1049        try:
1050            return dt.timestamp()
1051        except OverflowError:
1052            return -(self.UNIX_EPOCH - dt).total_seconds()
1053
1054    def parse_exact_iso(self, datestr):
1055        """
1056        This function is a meta function to `parse` function. It checks
1057        whether the date is of the iso format - it does not accept float-like
1058        date.
1059        """
1060        if not self._matches_iso_format(datestr):
1061            raise self.InvalidDateTimeFormatError(datestr)
1062        return self.parse(datestr)
1063
1064    def to_val(self, s):
1065        """
1066        Convert a value, given as an instance of an arbitrary type, to a float.
1067        """
1068        if isinstance(s, str):
1069            return self.parse(s)
1070        else:
1071            return super().to_val(s)
1072