import re
import warnings
from collections.abc import Iterable

from datetime import datetime, timedelta, timezone
from numbers import Number, Real, Integral
from math import isnan, floor
from pickle import PickleError

import numpy as np
import scipy.sparse as sp

from Orange.data import _variable
from Orange.util import Registry, Reprable, OrangeDeprecationWarning


__all__ = ["Unknown", "MISSING_VALUES", "make_variable", "is_discrete_values",
           "Value", "Variable", "ContinuousVariable", "DiscreteVariable",
           "StringVariable", "TimeVariable"]


# For storing unknowns
Unknown = ValueUnknown = float("nan")
# For checking for unknowns
MISSING_VALUES = {np.nan, "?", "nan", ".", "", "NA", "~", None}

DISCRETE_MAX_VALUES = 3  # == 2 + nan
MAX_NUM_OF_DECIMALS = 5
# the variable with more than 100 different values should not be StringVariable
DISCRETE_MAX_ALLOWED_VALUES = 100


def make_variable(cls, compute_value, *args):
    """
    Construct a variable of class `cls`; used as the unpickling callable
    returned by `Variable.__reduce__`.

    :param cls: variable class to instantiate
    :param compute_value: value for the `compute_value` keyword argument,
        or `None` to omit it
    :param args: positional arguments passed to `cls`
    :return: a new instance of `cls`
    """
    if compute_value is not None:
        return cls(*args, compute_value=compute_value)
    else:
        # For compatibility with old pickles: remove the third positional
        # argument (args[2]) if it is a bool. No current constructor takes
        # a bool in that position (`compute_value` can't be bool), so this
        # should be safe.
        if len(args) > 2 and isinstance(args[2], bool):
            args = args[:2] + args[3:]
        return cls(*args)


def is_discrete_values(values):
    """
    Return set of uniques if `values` is an iterable of discrete values
    else False if non-discrete, or None if indeterminate.

    Note
    ----
    Assumes consistent type of items of `values`.
    """
    # `len` (not truthiness) is used on purpose: `values` may be an array.
    if len(values) == 0:
        return None
    # If the first few values are, or can be converted to, floats,
    # the type is numeric
    try:
        # The expression is evaluated only for its possible ValueError.
        # pylint: disable=expression-not-assigned
        isinstance(next(iter(values)), Number) or \
        [v not in MISSING_VALUES and float(v)
         for _, v in zip(range(min(3, len(values))), values)]
    except ValueError:
        is_numeric = False
        max_values = int(round(len(values)**.7))
    else:
        is_numeric = True
        max_values = DISCRETE_MAX_VALUES

    # If more than max values => not discrete
    unique = set()
    for i in values:
        unique.add(i)
        if (len(unique) > max_values or
                len(unique) > DISCRETE_MAX_ALLOWED_VALUES):
            return False

    # Strip NaN from unique
    unique = {i for i in unique
              if (i not in MISSING_VALUES and
                  not (isinstance(i, Number) and np.isnan(i)))}

    # All NaNs => indeterminate
    if not unique:
        return None

    # Strings with |values| < max_unique
    if not is_numeric:
        return unique

    # Handle numbers
    try:
        unique_float = set(map(float, unique))
    except ValueError:
        # Converting all the values to floats resulted in an error.
        # Since the values have enough unique values, they are probably
        # string values and discrete.
        return unique

    # If only values are {0, 1} or {1, 2} (or a subset of those sets) => discrete
    return (not (unique_float - {0, 1}) or
            not (unique_float - {1, 2})) and unique


class Value(float):
    """
    The class representing a value. The class is not used to store values but
    only to return them in contexts in which we want the value to be accompanied
    with the descriptor, for instance to print the symbolic value of discrete
    variables.

    The class is derived from `float`, with an additional attribute `variable`
    which holds the descriptor of type :obj:`Orange.data.Variable`. If the
    value is continuous or discrete, it is stored as a float. Other types of
    values, like strings, are stored in the attribute `value`.

    The class overloads the methods for printing out the value:
    `variable.repr_val` and `variable.str_val` are used to get a suitable
    representation of the value.

    Equivalence operator is overloaded as follows:

    - unknown values are equal; if one value is unknown and the other is not,
      they are different;

    - if the value is compared with the string, the value is converted to a
      string using `variable.str_val` and the two strings are compared

    - if the value is stored in attribute `value`, it is compared with the
      given other value

    - otherwise, the inherited comparison operator for `float` is called.

    Finally, value defines a hash, so values can be put in sets and appear as
    keys in dictionaries (except for values of discrete variables, for which
    `__hash__` raises `TypeError`).

    .. attribute:: variable (:obj:`Orange.data.Variable`)

        Descriptor; used for printing out and for comparing with strings

    .. attribute:: value

        Value; the value can be of arbitrary type and is used only for variables
        that are neither discrete nor continuous. If `value` is `None`, the
        derived `float` value is used.
    """
    __slots__ = "variable", "_value"

    def __new__(cls, variable, value=Unknown):
        """
        Construct a new instance of Value with the given descriptor and value.
        For primitive variables, `value` is stored as the inherited `float`
        and the attribute `_value` is set to `None`. Otherwise, the
        inherited float is set to NaN if `value` equals `variable.Unknown`
        (and to the smallest finite float if not), and the value itself is
        held by the attribute `_value`.

        :param variable: descriptor
        :type variable: Orange.data.Variable
        :param value: value
        """
        if variable.is_primitive():
            self = super().__new__(cls, value)
            self.variable = variable
            self._value = None
        else:
            isunknown = value == variable.Unknown
            self = super().__new__(
                cls, np.nan if isunknown else np.finfo(float).min)
            self.variable = variable
            self._value = value
        return self

    def __init__(self, _, __=Unknown):
        # __new__ does the job, pylint: disable=super-init-not-called
        pass

    def __repr__(self):
        return "Value('%s', %s)" % (self.variable.name,
                                    self.variable.repr_val(self))

    def __str__(self):
        return self.variable.str_val(self)

    def __eq__(self, other):
        # An unknown equals any NaN number and any string/object listed in
        # the descriptor's `unknown_str`.
        if isinstance(self, Real) and isnan(self):
            if isinstance(other, Real):
                return isnan(other)
            else:
                return other in self.variable.unknown_str
        if isinstance(other, str):
            return self.variable.str_val(self) == other
        if isinstance(other, Value):
            return self.value == other.value
        return super().__eq__(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        if self.variable.is_primitive():
            if isinstance(other, str):
                # Strings are first converted by the descriptor
                return super().__lt__(self.variable.to_val(other))
            else:
                return super().__lt__(other)
        else:
            if isinstance(other, str):
                return self.value < other
            else:
                return self.value < other.value

    def __le__(self, other):
        return self.__lt__(other) or self.__eq__(other)

    def __gt__(self, other):
        return not self.__le__(other)

    def __ge__(self, other):
        return not self.__lt__(other)

    def __contains__(self, other):
        # Substring test; defined only when both the stored value and
        # `other` are strings.
        if (self._value is not None
                and isinstance(self._value, str)
                and isinstance(other, str)):
            return other in self._value
        raise TypeError("invalid operation on Value()")

    def __hash__(self):
        if self.variable.is_discrete:
            # It is not possible to hash the id and the domain value to the
            # same number as required by __eq__.
            # hash(1)
            # == hash(Value(DiscreteVariable("var", ["red", "green", "blue"]), 1))
            # == hash("green")
            # User should hash directly ids or domain values instead.
            raise TypeError("unhashable type - cannot hash values of discrete variables!")
        if self._value is None:
            return super().__hash__()
        else:
            return hash(self._value)

    @property
    def value(self):
        """
        The value in its natural form: the symbolic value (or `Unknown`) for
        discrete variables, the stored object for string variables and a
        plain `float` otherwise.
        """
        if self.variable.is_discrete:
            return Unknown if isnan(self) else self.variable.values[int(self)]
        if self.variable.is_string:
            return self._value
        return float(self)

    def __getnewargs__(self):
        return self.variable, float(self)

    def __getstate__(self):
        return dict(value=getattr(self, '_value', None))

    def __setstate__(self, state):
        # defined in __new__, pylint: disable=attribute-defined-outside-init
        self._value = state.get('value', None)


class VariableMeta(Registry):
    pass


class _predicatedescriptor(property):
    """
    A property that behaves as a class method if accessed via a class
    >>> class A:
    ...     foo = False
    ...     @_predicatedescriptor
    ...     def is_foo(self):
    ...         return self.foo
    ...
    >>> a = A()
    >>> a.is_foo
    False
    >>> A.is_foo(a)
    False
    """
    def __get__(self, instance, objtype=None):
        if instance is None:
            return self.fget
        else:
            return super().__get__(instance, objtype)


class Variable(Reprable, metaclass=VariableMeta):
    """
    The base class for variable descriptors contains the variable's
    name and some basic properties.

    .. attribute:: name

        The name of the variable.

    .. attribute:: unknown_str

        A set of values that represent unknowns in conversion from textual
        formats. Default is the module-level `MISSING_VALUES`, i.e.
        `{np.nan, "?", "nan", ".", "", "NA", "~", None}`.

    .. attribute:: compute_value

        A function for computing the variable's value when converting from
        another domain which does not contain this variable. The function will
        be called with a data set (`Orange.data.Table`) and has to return
        an array of computed values for all its instances. Exposed as a
        read-only property returning whatever was passed to the constructor
        (`None` by default).

    .. attribute:: sparse

        A flag about sparsity of the variable. When set, the variable suggests
        it should be stored in a sparse matrix.

    .. attribute:: source_variable

        An optional descriptor of the source variable - if any - from which
        this variable is derived and computed via :obj:`compute_value`.

    .. attribute:: attributes

        A dictionary with user-defined attributes of the variable
    """
    Unknown = ValueUnknown

    def __init__(self, name="", compute_value=None, *, sparse=False):
        """
        Construct a variable descriptor.

        :param name: name of the variable; an empty name triggers an
            `OrangeDeprecationWarning`
        :param compute_value: optional function computing the values
        :param sparse: flag suggesting sparse storage
        """
        if not name:
            warnings.warn("Variable must have a name", OrangeDeprecationWarning,
                          stacklevel=3)
        self._name = name
        self._compute_value = compute_value
        self.unknown_str = MISSING_VALUES
        self.source_variable = None
        self.sparse = sparse
        self.attributes = {}

    @property
    def name(self):
        return self._name

    def make_proxy(self):
        """
        Return a copy of the variable: a new instance of the same class that
        shares the instance dictionary's contents with `self`, except for
        `attributes`, which is copied into a separate dictionary.

        :return: copy of self
        :rtype: Variable
        """
        var = self.__class__(self.name)
        var.__dict__.update(self.__dict__)
        var.attributes = dict(self.attributes)
        return var

    def __eq__(self, other):
        if type(self) is not type(other):
            return False

        # Variables that are mere `Identity` transformations (e.g. renamed
        # variables) are compared through their ultimate sources.
        var1 = self._get_identical_source(self)
        var2 = self._get_identical_source(other)
        # pylint: disable=protected-access
        return (
            self.name == other.name
            and var1.name == var2.name
            and var1._compute_value == var2._compute_value
        )

    def __hash__(self):
        var = self._get_identical_source(self)
        return hash((self.name, var.name, type(self), var._compute_value))

    @staticmethod
    def _get_identical_source(var):
        """
        Follow the chain of `Identity` compute values and return the first
        variable whose `compute_value` is not an `Identity`.
        """
        # pylint: disable=protected-access,import-outside-toplevel
        from Orange.preprocess.transformation import Identity
        while isinstance(var._compute_value, Identity):
            var = var._compute_value.variable
        return var

    @classmethod
    def make(cls, name, *args, **kwargs):
        """
        Construct and return a new variable of this class; all arguments
        are passed to the constructor.
        """
        return cls(name, *args, **kwargs)

    @classmethod
    def _clear_cache(cls):
        warnings.warn(
            "_clear_cache is no longer needed and thus deprecated")

    @staticmethod
    def _clear_all_caches():
        warnings.warn(
            "_clear_all_caches is no longer needed and thus deprecated")

    @classmethod
    def is_primitive(cls, var=None):
        """
        `True` if the variable's values are stored as floats.
        Non-primitive variables can appear in the data only as meta attributes.

        :param var: if given, the type of `var` is checked instead of `cls`
        """
        to_check = cls if var is None else type(var)
        return issubclass(to_check, (DiscreteVariable, ContinuousVariable))

    @_predicatedescriptor
    def is_discrete(self):
        return isinstance(self, DiscreteVariable)

    @_predicatedescriptor
    def is_continuous(self):
        return isinstance(self, ContinuousVariable)

    @_predicatedescriptor
    def is_string(self):
        return isinstance(self, StringVariable)

    @_predicatedescriptor
    def is_time(self):
        return isinstance(self, TimeVariable)

    @staticmethod
    def repr_val(val):
        """
        Return a textual representation of variable's value `val`. Argument
        `val` must be a float (for primitive variables) or an arbitrary
        Python object (for non-primitives).

        Derived classes must overload the function.
        """
        raise RuntimeError("variable descriptors must overload repr_val()")

    str_val = repr_val

    def to_val(self, s):
        """
        Convert the given argument to a value of the variable. The
        argument can be a string, a number or `None`. For primitive variables,
        the base class provides a method that returns
        :obj:`~Orange.data.Unknown` if `s` is found in
        :obj:`~Orange.data.Variable.unknown_str`, and raises an exception
        otherwise. For non-primitive variables it returns the argument itself.

        Derived classes of primitive variables must overload the function.

        :param s: value, represented as a number, string or `None`
        :type s: str, float or None
        :rtype: float or object
        """
        if not self.is_primitive():
            return s
        if s in self.unknown_str:
            return Unknown
        raise RuntimeError(
            "primitive variable descriptors must overload to_val()")

    def val_from_str_add(self, s):
        """
        Convert the given string to a value of the variable. The method
        is similar to :obj:`to_val` except that it only accepts strings and
        that it adds new values to the variable's domain where applicable.

        The base class method calls `to_val`.

        :param s: symbolic representation of the value
        :type s: str
        :rtype: float or object
        """
        return self.to_val(s)

    def __str__(self):
        return self.name

    @property
    def compute_value(self):
        return self._compute_value

    def __reduce__(self):
        if not self.name:
            raise PickleError("Variables without names cannot be pickled")

        # Use make_variable to unpickle variables.
        return make_variable, (self.__class__, self._compute_value, self.name), self.__dict__

    # Sentinel: lets `copy` distinguish "argument not given" from an
    # explicit `compute_value=None`.
    _CopyComputeValue = object()

    def copy(self, compute_value=_CopyComputeValue, *, name=None, **kwargs):
        """
        Return a new variable of the same type with the same `sparse` flag
        and a copy of `attributes`.

        :param compute_value: compute value for the copy; if omitted, the
            compute value of `self` is reused
        :param name: name of the copy; defaults to the name of `self`
        :param kwargs: additional constructor arguments
        """
        if compute_value is self._CopyComputeValue:
            compute_value = self.compute_value
        var = type(self)(name=name or self.name,
                         compute_value=compute_value,
                         sparse=self.sparse, **kwargs)
        var.attributes = dict(self.attributes)
        return var

    def renamed(self, new_name):
        """
        Return a copy named `new_name` whose values are computed from `self`
        through an `Identity` transformation.
        """
        # prevent cyclic import, pylint: disable=import-outside-toplevel
        from Orange.preprocess.transformation import Identity
        return self.copy(name=new_name, compute_value=Identity(variable=self))


del _predicatedescriptor


class ContinuousVariable(Variable):
    """
    Descriptor for continuous variables.

    .. attribute:: number_of_decimals

        The number of decimals when the value is printed out (default: 3).

    .. attribute:: adjust_decimals

        A flag regulating whether the `number_of_decimals` is being adjusted
        by :obj:`to_val`.

    The value of `number_of_decimals` is set to 3 and `adjust_decimals`
    is set to 2. When :obj:`val_from_str_add` is called for the first
    time with a string as an argument, `number_of_decimals` is set to the
    number of decimals in the string and `adjust_decimals` is set to 1.
    In the subsequent calls of `to_val`, the number of decimals is
    increased if the string argument has a larger number of decimals.

    If the `number_of_decimals` is set manually, `adjust_decimals` is
    set to 0 to prevent changes by `to_val`.
    """

    TYPE_HEADERS = ('continuous', 'c', 'numeric', 'n')

    def __init__(self, name="", number_of_decimals=None, compute_value=None, *, sparse=False):
        """
        Construct a new continuous variable. If `number_of_decimals` is
        `None`, the number of decimals is set to three and
        `adjust_decimals` to 2; otherwise the given number is used and
        `adjust_decimals` is set to 0.
        """
        super().__init__(name, compute_value, sparse=sparse)
        self._max_round_diff = 0
        self.number_of_decimals = number_of_decimals

    @property
    def number_of_decimals(self):
        return self._number_of_decimals

    @property
    def format_str(self):
        return self._format_str

    @format_str.setter
    def format_str(self, value):
        self._format_str = value

    # noinspection PyAttributeOutsideInit
    @number_of_decimals.setter
    def number_of_decimals(self, x):
        if x is None:
            # Defaults: three decimals, adjustable, general format
            self._number_of_decimals = 3
            self.adjust_decimals = 2
            self._format_str = "%g"
            return

        self._number_of_decimals = x
        # Tolerance used by `repr_val` to detect values that do not fit
        # into the prescribed number of decimals
        self._max_round_diff = 10 ** (-x - 6)
        self.adjust_decimals = 0
        if self._number_of_decimals <= MAX_NUM_OF_DECIMALS:
            self._format_str = "%.{}f".format(self.number_of_decimals)
        else:
            self._format_str = "%g"

    def to_val(self, s):
        """
        Convert a value, given as an instance of an arbitrary type, to a float.
        Values found in `unknown_str` are converted to `Unknown`.
        """
        if s in self.unknown_str:
            return Unknown
        return float(s)

    def val_from_str_add(self, s):
        """
        Convert a value from a string and adjust the number of decimals if
        `adjust_decimals` is non-zero.
        """
        return _variable.val_from_str_add_cont(self, s)

    def repr_val(self, val):
        """
        Return the value as a string with the prescribed number of decimals.
        Non-finite values are shown as "?". If the format is not "%g" and
        rounding to `number_of_decimals` would change the value by more
        than the tolerance, two additional decimals are printed.
        """
        # Table value can't be inf, but repr_val can be used to print any float
        if not np.isfinite(val):
            return "?"
        if self.format_str != "%g" \
                and abs(round(val, self._number_of_decimals) - val) \
                > self._max_round_diff:
            return f"{val:.{self._number_of_decimals + 2}f}"
        return self._format_str % val

    str_val = repr_val

    def copy(self, compute_value=Variable._CopyComputeValue,
             *, name=None, **kwargs):
        """
        Copy the variable. If `number_of_decimals` is given in `kwargs`, it
        is set on the copy through the property setter; otherwise the
        formatting-related attributes are copied from `self` verbatim.
        """
        # pylint does not understand that `var` is `ContinuousVariable`:
        # pylint: disable=protected-access
        number_of_decimals = kwargs.pop("number_of_decimals", None)
        var = super().copy(compute_value=compute_value, name=name, **kwargs)
        if number_of_decimals is not None:
            var.number_of_decimals = number_of_decimals
        else:
            var._number_of_decimals = self._number_of_decimals
            var._max_round_diff = self._max_round_diff
            var.adjust_decimals = self.adjust_decimals
            var.format_str = self._format_str
        return var


TupleList = tuple  # backward compatibility (for pickled table)


class DiscreteVariable(Variable):
    """
    Descriptor for symbolic, discrete variables. Values of discrete variables
    are stored as floats; the numbers correspond to indices in the tuple of
    values.

    .. attribute:: values

        A tuple of variable's values.
    """

    TYPE_HEADERS = ('discrete', 'd', 'categorical')

    presorted_values = []

    def __init__(
            self, name="", values=(), compute_value=None, *, sparse=False
    ):
        """
        Construct a discrete variable descriptor with the given values.

        :raises TypeError: if any of `values` is not a string
        """
        values = tuple(values)  # some people (including me) pass a generator
        if not all(isinstance(value, str) for value in values):
            raise TypeError("values of DiscreteVariables must be strings")

        super().__init__(name, compute_value, sparse=sparse)
        self._values = values
        # Reverse lookup: symbolic value -> index
        self._value_index = {value: i for i, value in enumerate(values)}

    @property
    def values(self):
        return self._values

    def get_mapping_from(self, other):
        """
        Return a float array whose i-th element is the index, in `self`, of
        the i-th value of `other`, or NaN if `self` does not have the value.
        """
        return np.array(
            [self._value_index.get(value, np.nan) for value in other.values],
            dtype=float)

    def get_mapper_from(self, other):
        """
        Return a function that maps values of variable `other` to values of
        `self`.

        The returned `mapper(value, col_idx=None)` works in two modes:

        - if `col_idx` is given, column `col_idx` of `value` (a 2d numpy
          array or a csc/csr matrix) is mapped in place and `None` is
          returned;
        - otherwise a mapped copy is returned; `value` can be a number,
          a string, a numpy array, a sparse matrix or another iterable.
        """
        mapping = self.get_mapping_from(other)
        if not mapping.size:
            # Nans in data are temporarily replaced with 0, mapped and changed
            # back to nans. This would fail if mapping[0] is out of range.
            mapping = np.array([np.nan])

        def mapper(value, col_idx=None):

            # In-place mapping
            if col_idx is not None:
                if sp.issparse(value) and mapping[0] != 0:
                    raise ValueError(
                        "In-place mapping of sparse matrices must map 0 to 0")

                # CSR requires mapping of non-contiguous area
                if sp.isspmatrix_csr(value):
                    col = value.indices == col_idx
                    nans = np.isnan(value.data) * col
                    value.data[nans] = 0
                    value.data[col] = mapping[value.data[col].astype(int)]
                    value.data[nans] = np.nan
                    return None

                # Dense and CSC map a contiguous area
                if isinstance(value, np.ndarray) and value.ndim == 2:
                    col = value[:, col_idx]
                elif sp.isspmatrix_csc(value):
                    col = value.data[value.indptr[col_idx]
                                     :value.indptr[col_idx + 1]]
                else:
                    # The separating space was missing between the two
                    # literals ("...array ora csc...").
                    raise ValueError(
                        "In-place column mapping requires a 2d array or "
                        "a csc or csr matrix.")

                nans = np.isnan(col)
                col[nans] = 0
                col[:] = mapping[col.astype(int)]
                col[nans] = np.nan
                return None

            # Mapping into a copy
            if isinstance(value, (int, float)):
                return value if np.isnan(value) else mapping[int(value)]
            if isinstance(value, str):
                return mapping[other.values.index(value)]
            if isinstance(value, np.ndarray):
                # NOTE(review): as written, every 2d array is rejected while
                # arrays of 3 or more dimensions without a unit-length axis
                # are accepted; possibly `ndim == 2 and min(shape) == 1`
                # was intended - confirm before changing.
                if not (value.ndim == 1
                        or value.ndim != 2 and min(value.shape) != 1):
                    raise ValueError(
                        f"Column mapping can't map {value.ndim}-d objects")

                if value.dtype == object:
                    value = value.astype(float)  # this happens with metas
                try:
                    nans = np.isnan(value)
                except TypeError:  # suppose it's already an integer type
                    return mapping[value]
                value = value.astype(int)
                value[nans] = 0
                value = mapping[value]
                value[nans] = np.nan
                return value
            if sp.issparse(value):
                if min(value.shape) != 1:
                    raise ValueError("Column mapping can't map "
                                     f"{value.ndim}-dimensional objects")
                # Implicit zeros would have to change, so map a dense copy
                if mapping[0] != 0 and not np.isnan(mapping[0]):
                    return mapper(np.array(value.todense()).flatten())
                value = value.copy()
                value.data = mapper(value.data)
                return value
            if isinstance(value, Iterable):
                return type(value)(val if np.isnan(val) else mapping[int(val)]
                                   for val in value)
            raise ValueError(
                f"invalid type for value(s): {type(value).__name__}")

        return mapper

    def to_val(self, s):
        """
        Convert the given argument to a value of the variable (`float`).
        If the argument is numeric, its value is returned without checking
        whether it is integer and within bounds. `Unknown` is returned if the
        argument is one of the representations for unknown values. Otherwise,
        the argument must be a string and the method returns its index in
        :obj:`values`.

        :param s: values, represented as a number, string or `None`
        :rtype: float
        :raises TypeError: if `s` is neither a number, a string nor a
            representation of an unknown value
        :raises ValueError: if `s` is a string that is not among `values`
        """
        if s is None:
            return ValueUnknown

        if isinstance(s, Integral):
            return s
        if isinstance(s, Real):
            return s if isnan(s) else floor(s + 0.25)
        if s in self.unknown_str:
            return ValueUnknown
        if not isinstance(s, str):
            raise TypeError('Cannot convert {} to value of "{}"'.format(
                type(s).__name__, self.name))
        if s not in self._value_index:
            raise ValueError(f"Value {s} does not exist")
        return self._value_index[s]

    def add_value(self, s):
        """
        Add a value `s` to the values of the variable; do nothing if the
        variable already has it.

        :raises TypeError: if `s` is not a string
        """
        if not isinstance(s, str):
            raise TypeError("values of DiscreteVariables must be strings")
        if s in self._value_index:
            return
        self._value_index[s] = len(self.values)
        self._values += (s, )

    def val_from_str_add(self, s):
        """
        Similar to :obj:`to_val`, except that the argument is converted to
        a string (unless it is `None`) and that the value is added to the
        variable's values if it does not exist yet.

        :param s: symbolic representation of the value
        :type s: str
        :rtype: float
        """
        s = str(s) if s is not None else s
        if s in self.unknown_str:
            return ValueUnknown
        val = self._value_index.get(s)
        if val is None:
            self.add_value(s)
            val = len(self.values) - 1
        return val

    def repr_val(self, val):
        """
        Return a textual representation of the value (`self.values[int(val)]`)
        or "?" if the value is unknown.

        :param val: value
        :type val: float (should be whole number)
        :rtype: str
        """
        if isnan(val):
            return "?"
        return '{}'.format(self.values[int(val)])

    str_val = repr_val

    def __reduce__(self):
        if not self.name:
            raise PickleError("Variables without names cannot be pickled")
        # `_values` is passed as a constructor argument instead of state
        state = dict(self.__dict__)
        state.pop("_values")
        return (
            make_variable,
            (self.__class__, self._compute_value, self.name, self.values),
            state
        )

    def copy(self, compute_value=Variable._CopyComputeValue,
             *, name=None, values=None, **_):
        """
        Copy the variable, optionally replacing its values. Other keyword
        arguments are ignored.

        :param values: new values; their number must match the number of
            the original values
        :raises ValueError: if the number of `values` differs
        """
        # pylint: disable=arguments-differ
        if values is not None and len(values) != len(self.values):
            raise ValueError(
                "number of values must match the number of original values")
        return super().copy(compute_value=compute_value, name=name,
                            values=values or self.values)


class StringVariable(Variable):
    """
    Descriptor for string variables. String variables can only appear as
    meta attributes.
    """
    Unknown = ""
    TYPE_HEADERS = ('string', 's', 'text')

    def to_val(self, s):
        """
        Return the value as a string. If it is already a string, the same
        object is returned; `None` is converted to an empty string.
        """
        if s is None:
            return ""
        if isinstance(s, str):
            return s
        return str(s)

    val_from_str_add = to_val

    @staticmethod
    def str_val(val):
        """
        Return a string representation of the value; "?" for an empty
        string and for a `Value` whose `value` is falsy.
        """
        if isinstance(val, str) and val == "":
            return "?"
        if isinstance(val, Value):
            if not val.value:
                return "?"
            val = val.value
        return str(val)

    def repr_val(self, val):
        """Return a string representation of the value, in double quotes."""
        return '"{}"'.format(self.str_val(val))


class TimeVariable(ContinuousVariable):
    """
    TimeVariable is a continuous variable with Unix epoch
    (1970-01-01 00:00:00+0000) as the origin (0.0). Later dates are positive
    real numbers (equivalent to Unix timestamp, with microseconds in the
    fraction part), and the dates before it map to the negative real numbers.

    Unfortunately due to limitation of Python datetime, only dates
    with year >= 1 (A.D.) are supported.

    If time is specified without a date, Unix epoch is assumed.

    If time is specified without a UTC offset, `parse` interprets it as UTC.
    """
    _all_vars = {}
    TYPE_HEADERS = ('time', 't')
    UNIX_EPOCH = datetime(1970, 1, 1)
    _ISO_FORMATS = (
        # have_date, have_time, format_str
        # in order of decreased probability
        (1, 1, '%Y-%m-%d %H:%M:%S%z'),
        (1, 1, '%Y-%m-%d %H:%M:%S'),
        (1, 1, '%Y-%m-%d %H:%M'),
        (1, 1, '%Y-%m-%dT%H:%M:%S%z'),
        (1, 1, '%Y-%m-%dT%H:%M:%S'),

        (1, 0, '%Y-%m-%d'),

        (1, 1, '%Y-%m-%d %H:%M:%S.%f'),
        (1, 1, '%Y-%m-%dT%H:%M:%S.%f'),
        (1, 1, '%Y-%m-%d %H:%M:%S.%f%z'),
        (1, 1, '%Y-%m-%dT%H:%M:%S.%f%z'),

        (1, 1, '%Y%m%dT%H%M%S%z'),
        (1, 1, '%Y%m%d%H%M%S%z'),

        (0, 1, '%H:%M:%S.%f'),
        (0, 1, '%H:%M:%S'),
        (0, 1, '%H:%M'),

        # These parse as continuous features (plain numbers)
        (1, 1, '%Y%m%dT%H%M%S'),
        (1, 1, '%Y%m%d%H%M%S'),
        (1, 0, '%Y%m%d'),
        (1, 0, '%Y%j'),
        (1, 0, '%Y'),
        (0, 1, '%H%M%S.%f'),

        # BUG: In Python as in C, %j doesn't necessitate 0-padding,
        # so these two lines must be in this order
        (1, 0, '%Y-%m'),
        (1, 0, '%Y-%j'),
    )
    # Order in which `_ISO_FORMATS` are tried. Must never change order of
    # last 2 items. Only modified via assignment in `parse`.
    __ISO_FORMATS_PROBE_SEQ = list(range(len(_ISO_FORMATS)))
    # The regex that matches all above formats
    REGEX = (r'^('
             r'\d{1,4}-\d{2}-\d{2}([ T]\d{2}:\d{2}(:\d{2}(\.\d+)?([+-]\d{4})?)?)?|'
             r'\d{1,4}\d{2}\d{2}(T?\d{2}\d{2}\d{2}([+-]\d{4})?)?|'
             r'\d{2}:\d{2}(:\d{2}(\.\d+)?)?|'
             r'\d{2}\d{2}\d{2}\.\d+|'
             r'\d{1,4}(-?\d{2,3})?'
             r')$')

    class InvalidDateTimeFormatError(ValueError):
        def __init__(self, date_string):
            super().__init__(
                "Invalid datetime format '{}'. "
                "Only ISO 8601 supported.".format(date_string))

    _matches_iso_format = re.compile(REGEX).match

    # UTC offset and associated timezone. If parsed datetime values provide an
    # offset, it is used for display. If not all values have the same offset,
    # +0000 (=UTC) timezone is used and utc_offset is set to False.
    utc_offset = None
    timezone = timezone.utc

    def __init__(self, *args, have_date=0, have_time=0, **kwargs):
        """
        Construct a time variable.

        :param have_date: non-zero if the values include a date part
        :param have_time: non-zero if the values include a time part

        Other arguments are passed to `ContinuousVariable`.
        """
        super().__init__(*args, **kwargs)
        self.have_date = have_date
        self.have_time = have_time

    def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_):
        """
        Copy the variable, preserving `have_date` and `have_time`. Other
        keyword arguments are ignored.
        """
        return super().copy(compute_value=compute_value, name=name,
                            have_date=self.have_date, have_time=self.have_time)

    @staticmethod
    def _tzre_sub(s, _subtz=re.compile(r'([+-])(\d\d):(\d\d)$').sub):
        # Replace +ZZ:ZZ with ISO-compatible +ZZZZ, or strip +0000
        return s[:-6] if s.endswith(('+00:00', '-00:00')) else _subtz(r'\1\2\3', s)

    def repr_val(self, val):
        """
        Return the value as a date and/or time string in the variable's
        timezone; "?" for unknown values. If the variable has neither the
        date nor the time part, the number itself is printed.
        """
        if isnan(val):
            return '?'
        if not self.have_date and not self.have_time:
            # The time is relative, unitless. The value is absolute.
            return str(val.value) if isinstance(val, Value) else str(val)

        # If you know how to simplify this, be my guest
        seconds = int(val)
        microseconds = int(round((val - seconds) * 1e6))
        if microseconds >= 1000000:
            # The fraction rounded up to a whole second (e.g. x.9999996);
            # carry it over, since `datetime.replace` accepts only
            # microseconds in 0..999999.
            seconds, microseconds = seconds + 1, microseconds - 1000000
        if val < 0:
            if microseconds:
                seconds, microseconds = seconds - 1, int(1e6) + microseconds
            date = datetime.fromtimestamp(0, tz=self.timezone) + timedelta(seconds=seconds)
        else:
            date = datetime.fromtimestamp(seconds, tz=self.timezone)
        date = str(date.replace(microsecond=microseconds))

        if self.have_date and not self.have_time:
            date = date.split()[0]
        elif not self.have_date and self.have_time:
            date = date.split()[1]
        date = self._tzre_sub(date)
        return date

    str_val = repr_val

    def parse(self, datestr):
        """
        Return `datestr`, a datetime provided in one of ISO 8601 formats,
        parsed as a real number. Value 0 marks the Unix epoch, positive values
        are the dates after it, negative before.

        If date is unspecified, epoch date is assumed.

        If time is unspecified, 00:00:00.0 is assumed.

        If timezone is unspecified, the value is interpreted as UTC.

        A string that does not match any of the formats but can be converted
        to a float is returned as that float (assumed to be a Unix
        timestamp). Values in `MISSING_VALUES` give `Unknown`.

        As a side effect, the method updates `have_date`, `have_time`,
        `utc_offset` and `timezone` of the variable.

        :raises InvalidDateTimeFormatError: if `datestr` can't be parsed
        """
        if datestr in MISSING_VALUES:
            return Unknown
        datestr = datestr.strip().rstrip('Z')

        if not self._matches_iso_format(datestr):
            try:
                # If it is a number, assume it is a unix timestamp
                value = float(datestr)
                self.have_date = self.have_time = 1
                return value
            except ValueError as err:
                raise self.InvalidDateTimeFormatError(datestr) from err

        try_order = self.__ISO_FORMATS_PROBE_SEQ
        for i, (have_date, have_time, fmt) in enumerate(
                map(self._ISO_FORMATS.__getitem__, try_order)):
            try:
                dt = datetime.strptime(datestr, fmt)
            except ValueError:
                continue
            else:
                # Pop this most-recently-used format index to front,
                # excluding last 2
                if 0 < i < len(try_order) - 2:
                    try_order = try_order.copy()
                    try_order[i], try_order[0] = try_order[0], try_order[i]
                    TimeVariable.__ISO_FORMATS_PROBE_SEQ = try_order
                self.have_date |= have_date
                self.have_time |= have_time
                if not have_date:
                    dt = dt.replace(self.UNIX_EPOCH.year,
                                    self.UNIX_EPOCH.month,
                                    self.UNIX_EPOCH.day)
                break
        else:
            raise self.InvalidDateTimeFormatError(datestr)

        # Remember UTC offset. If not all parsed values share the same offset,
        # remember none of it.
        offset = dt.utcoffset()
        if self.utc_offset is not False:
            if offset and self.utc_offset is None:
                self.utc_offset = offset
                self.timezone = timezone(offset)
            elif self.utc_offset != offset:
                self.utc_offset = False
                self.timezone = timezone.utc

        # Convert time to UTC timezone. Dates without timezone are stamped
        # as UTC as they are. See also:
        # https://docs.python.org/3.4/library/datetime.html#datetime.datetime.timestamp
        if dt.tzinfo:
            dt -= dt.utcoffset()
        dt = dt.replace(tzinfo=timezone.utc)

        # Unix epoch is the origin, older dates are negative
        try:
            return dt.timestamp()
        except OverflowError:
            # `dt` is timezone-aware here, so the (naive) epoch must be made
            # aware too; subtracting naive and aware datetimes raises
            # TypeError.
            epoch = self.UNIX_EPOCH.replace(tzinfo=timezone.utc)
            return -(epoch - dt).total_seconds()

    def parse_exact_iso(self, datestr):
        """
        This function is a meta function to `parse` function. It checks
        whether the date is of the iso format - it does not accept float-like
        date.

        :raises InvalidDateTimeFormatError: if `datestr` does not match the
            ISO regex
        """
        if not self._matches_iso_format(datestr):
            raise self.InvalidDateTimeFormatError(datestr)
        return self.parse(datestr)

    def to_val(self, s):
        """
        Convert a value, given as an instance of an arbitrary type, to a float.
        Strings are parsed with `parse`; other values are handled by
        `ContinuousVariable.to_val`.
        """
        if isinstance(s, str):
            return self.parse(s)
        else:
            return super().to_val(s)