"""String formatting routines for __repr__.
"""
import contextlib
import functools
from datetime import datetime, timedelta
from itertools import chain, zip_longest
from typing import Hashable

import numpy as np
import pandas as pd
from pandas.errors import OutOfBoundsDatetime

from .duck_array_ops import array_equiv
from .indexing import MemoryCachedArray
from .options import OPTIONS, _get_boolean_with_default
from .pycompat import dask_array_type, sparse_array_type
from .utils import is_duck_array


def pretty_print(x, numchars: int):
    """Given an object `x`, call `str(x)` and format the returned string so
    that it is numchars long, padding with trailing spaces or truncating with
    ellipses as necessary
    """
    s = maybe_truncate(x, numchars)
    # Right-pad with spaces so the result is exactly ``numchars`` wide.
    return s + " " * max(numchars - len(s), 0)


def maybe_truncate(obj, maxlen=500):
    """Return ``str(obj)``, truncated to at most ``maxlen`` characters.

    Truncated strings end with ``"..."`` (the ellipsis is counted toward
    ``maxlen``).
    """
    s = str(obj)
    if len(s) > maxlen:
        # Leave room for the three-character ellipsis.
        s = s[: (maxlen - 3)] + "..."
    return s


def wrap_indent(text, start="", length=None):
    """Prefix the first line of ``text`` with ``start`` and indent every
    following line by ``length`` spaces (default: ``len(start)``) so the
    lines align under the prefix.
    """
    if length is None:
        length = len(start)
    indent = "\n" + " " * length
    return start + indent.join(x for x in text.splitlines())


def _get_indexer_at_least_n_items(shape, n_desired, from_end):
    """Build a slice-only indexer that selects at least ``n_desired`` items
    from the front (``from_end=False``) or back (``from_end=True``) of an
    array with the given ``shape``, suitable for a single ``__getitem__``
    call.
    """
    assert 0 < n_desired <= np.prod(shape)
    # Cumulative number of items covered by the trailing dimensions.
    cum_items = np.cumprod(shape[::-1])
    # Number of trailing dimensions taken whole.
    n_steps = np.argmax(cum_items >= n_desired)
    stop = int(np.ceil(float(n_desired) / np.r_[1, cum_items][n_steps]))
    indexer = (
        ((-1 if from_end else 0),) * (len(shape) - 1 - n_steps)
        + ((slice(-stop, None) if from_end else slice(stop)),)
        + (slice(None),) * n_steps
    )
    return indexer


def first_n_items(array, n_desired):
    """Returns the first n_desired items of an array"""
    # Unfortunately, we can't just do array.flat[:n_desired] here because it
    # might not be a numpy.ndarray. Moreover, access to elements of the array
    # could be very expensive (e.g. if it's only available over DAP), so go out
    # of our way to get them in a single call to __getitem__ using only slices.
    if n_desired < 1:
        raise ValueError("must request at least one item")

    if array.size == 0:
        # work around for https://github.com/numpy/numpy/issues/5195
        return []

    if n_desired < array.size:
        indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=False)
        array = array[indexer]
    return np.asarray(array).flat[:n_desired]


def last_n_items(array, n_desired):
    """Returns the last n_desired items of an array"""
    # Unfortunately, we can't just do array.flat[-n_desired:] here because it
    # might not be a numpy.ndarray. Moreover, access to elements of the array
    # could be very expensive (e.g. if it's only available over DAP), so go out
    # of our way to get them in a single call to __getitem__ using only slices.
    if (n_desired == 0) or (array.size == 0):
        return []

    if n_desired < array.size:
        indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=True)
        array = array[indexer]
    return np.asarray(array).flat[-n_desired:]


def last_item(array):
    """Returns the last item of an array in a list or an empty list."""
    if array.size == 0:
        # work around for https://github.com/numpy/numpy/issues/5195
        return []

    # Take the last element along every dimension in one __getitem__ call.
    indexer = (slice(-1, None),) * array.ndim
    return np.ravel(np.asarray(array[indexer])).tolist()


def format_timestamp(t):
    """Cast given object to a Timestamp and return a nicely formatted string"""
    # Timestamp is only valid for 1678 to 2262
    try:
        datetime_str = str(pd.Timestamp(t))
    except OutOfBoundsDatetime:
        datetime_str = str(t)

    try:
        date_str, time_str = datetime_str.split()
    except ValueError:
        # catch NaT and others that don't split nicely
        return datetime_str
    else:
        if time_str == "00:00:00":
            # Midnight: show the date only.
            return date_str
        else:
            return f"{date_str}T{time_str}"


def format_timedelta(t, timedelta_format=None):
    """Cast given object to a Timedelta and return a nicely formatted string"""
    timedelta_str = str(pd.Timedelta(t))
    try:
        days_str, time_str = timedelta_str.split(" days ")
    except ValueError:
        # catch NaT and others that don't split nicely
        return timedelta_str
    else:
        if timedelta_format == "date":
            return days_str + " days"
        elif timedelta_format == "time":
            return time_str
        else:
            # Unrecognized/None format: keep the full "N days HH:MM:SS" form.
            return timedelta_str


def format_item(x, timedelta_format=None, quote_strings=True):
    """Returns a succinct summary of an object as a string"""
    if isinstance(x, (np.datetime64, datetime)):
        return format_timestamp(x)
    if isinstance(x, (np.timedelta64, timedelta)):
        return format_timedelta(x, timedelta_format=timedelta_format)
    elif isinstance(x, (str, bytes)):
        return repr(x) if quote_strings else x
    elif hasattr(x, "dtype") and np.issubdtype(x.dtype, np.floating):
        # Four significant digits keeps items short in one-line reprs.
        return f"{x:.4}"
    else:
        return str(x)


def format_items(x):
    """Returns a succinct summaries of all items in a sequence as strings"""
    x = np.asarray(x)
    timedelta_format = "datetime"
    if np.issubdtype(x.dtype, np.timedelta64):
        x = np.asarray(x, dtype="timedelta64[ns]")
        # Split each value into its whole-day part and the sub-day remainder
        # (NaT excluded) to pick the most compact shared format.
        day_part = x[~pd.isnull(x)].astype("timedelta64[D]").astype("timedelta64[ns]")
        time_needed = x[~pd.isnull(x)] != day_part
        day_needed = day_part != np.timedelta64(0, "ns")
        if np.logical_not(day_needed).all():
            # No value has a day component: show times only.
            timedelta_format = "time"
        elif np.logical_not(time_needed).all():
            # No value has a sub-day component: show days only.
            timedelta_format = "date"

    formatted = [format_item(xi, timedelta_format) for xi in x]
    return formatted


def format_array_flat(array, max_width: int):
    """Return a formatted string for as many items in the flattened version of
    array that will fit within max_width characters.
    """
    # every item will take up at least two characters, but we always want to
    # print at least first and last items
    max_possibly_relevant = min(
        max(array.size, 1), max(int(np.ceil(max_width / 2.0)), 2)
    )
    relevant_front_items = format_items(
        first_n_items(array, (max_possibly_relevant + 1) // 2)
    )
    relevant_back_items = format_items(last_n_items(array, max_possibly_relevant // 2))
    # interleave relevant front and back items:
    # [a, b, c] and [y, z] -> [a, z, b, y, c]
    relevant_items = sum(
        zip_longest(relevant_front_items, reversed(relevant_back_items)), ()
    )[:max_possibly_relevant]

    # Cumulative width of the items joined by single spaces.
    cum_len = np.cumsum([len(s) + 1 for s in relevant_items]) - 1
    if (array.size > 2) and (
        (max_possibly_relevant < array.size) or (cum_len > max_width).any()
    ):
        padding = " ... "
        # Largest item count that still fits alongside the padding (>= 2 so
        # the first and last items always appear).
        max_len = max(int(np.argmax(cum_len + len(padding) - 1 > max_width)), 2)  # type: ignore[type-var]
        count = min(array.size, max_len)
    else:
        count = array.size
        padding = "" if (count <= 1) else " "

    num_front = (count + 1) // 2
    num_back = count - num_front
    # note that num_back is 0 <--> array.size is 0 or 1
    # <--> relevant_back_items is []
    pprint_str = "".join(
        [
            " ".join(relevant_front_items[:num_front]),
            padding,
            " ".join(relevant_back_items[-num_back:]),
        ]
    )

    # As a final check, if it's still too long even with the limit in values,
    # replace the end with an ellipsis
    # NB: this will still returns a full 3-character ellipsis when max_width < 3
    if len(pprint_str) > max_width:
        pprint_str = pprint_str[: max(max_width - 3, 0)] + "..."

    return pprint_str


# Short display names for well-known array meta types in dask reprs.
_KNOWN_TYPE_REPRS = {np.ndarray: "np.ndarray"}
with contextlib.suppress(ImportError):
    import sparse

    _KNOWN_TYPE_REPRS[sparse.COO] = "sparse.COO"


def inline_dask_repr(array):
    """Similar to dask.array.DataArray.__repr__, but without
    redundant information that's already printed by the repr
    function of the xarray wrapper.
    """
    assert isinstance(array, dask_array_type), array

    # First chunk size along each dimension.
    chunksize = tuple(c[0] for c in array.chunks)

    if hasattr(array, "_meta"):
        meta = array._meta
        if type(meta) in _KNOWN_TYPE_REPRS:
            meta_repr = _KNOWN_TYPE_REPRS[type(meta)]
        else:
            meta_repr = type(meta).__name__
        meta_string = f", meta={meta_repr}"
    else:
        meta_string = ""

    return f"dask.array<chunksize={chunksize}{meta_string}>"


def inline_sparse_repr(array):
    """Similar to sparse.COO.__repr__, but without the redundant shape/dtype."""
    assert isinstance(array, sparse_array_type), array
    return "<{}: nnz={:d}, fill_value={!s}>".format(
        type(array).__name__, array.nnz, array.fill_value
    )


def inline_variable_array_repr(var, max_width):
    """Build a one-line summary of a variable's data."""
    if var._in_memory:
        return format_array_flat(var, max_width)
    elif hasattr(var._data, "_repr_inline_"):
        # Duck arrays may provide their own one-line repr hook.
        return var._data._repr_inline_(max_width)
    elif isinstance(var._data, dask_array_type):
        return inline_dask_repr(var.data)
    elif isinstance(var._data, sparse_array_type):
        return inline_sparse_repr(var.data)
    elif hasattr(var._data, "__array_function__"):
        return maybe_truncate(repr(var._data).replace("\n", " "), max_width)
    else:
        # internal xarray array type
        return "..."
def summarize_variable(
    name: Hashable, var, col_width: int, marker: str = " ", max_width: int = None
):
    """Summarize a variable in one line, e.g., for the Dataset.__repr__."""
    if max_width is None:
        max_width_options = OPTIONS["display_width"]
        if not isinstance(max_width_options, int):
            # NOTE(review): this message interpolates `max_width` (always None
            # on this path) rather than the offending `max_width_options`
            # value — confirm and fix the message separately.
            raise TypeError(f"`max_width` value of `{max_width}` is not a valid int")
        else:
            max_width = max_width_options
    first_col = pretty_print(f" {marker} {name} ", col_width)
    if var.dims:
        dims_str = "({}) ".format(", ".join(map(str, var.dims)))
    else:
        dims_str = ""
    front_str = f"{first_col}{dims_str}{var.dtype} "

    # Remaining width is given to the inline data summary.
    values_width = max_width - len(front_str)
    values_str = inline_variable_array_repr(var, values_width)

    return front_str + values_str


def _summarize_coord_multiindex(coord, col_width, marker):
    """One-line header for a MultiIndex coordinate (levels listed separately)."""
    first_col = pretty_print(f" {marker} {coord.name} ", col_width)
    return "{}({}) MultiIndex".format(first_col, str(coord.dims[0]))


def _summarize_coord_levels(coord, col_width, marker="-"):
    """Summarize each level of a MultiIndex coordinate, one line per level."""
    # NOTE(review): the `col_width < len(coord)` part of this guard looks
    # unrelated to display width — presumably intended to subset very long
    # coordinates before formatting; verify against callers.
    if len(coord) > 100 and col_width < len(coord):
        n_values = col_width
        indices = list(range(0, n_values)) + list(range(-n_values, 0))
        subset = coord[indices]
    else:
        subset = coord

    return "\n".join(
        summarize_variable(
            lname, subset.get_level_variable(lname), col_width, marker=marker
        )
        for lname in subset.level_names
    )


def summarize_datavar(name, var, col_width):
    """Summarize a data variable in one line for Dataset.__repr__."""
    return summarize_variable(name, var.variable, col_width)


def summarize_coord(name: Hashable, var, col_width: int):
    """Summarize a coordinate; index coords are marked with "*" and
    MultiIndexes expand into one line per level.
    """
    is_index = name in var.dims
    marker = "*" if is_index else " "
    if is_index:
        coord = var.variable.to_index_variable()
        if coord.level_names is not None:
            # MultiIndex: header line plus one line per level.
            return "\n".join(
                [
                    _summarize_coord_multiindex(coord, col_width, marker),
                    _summarize_coord_levels(coord, col_width),
                ]
            )
    return summarize_variable(name, var.variable, col_width, marker)


def summarize_attr(key, value, col_width=None):
    """Summary for __repr__ - use ``X.attrs[key]`` for full value."""
    # Indent key and add ':', then right-pad if col_width is not None
    k_str = f" {key}:"
    if col_width is not None:
        k_str = pretty_print(k_str, col_width)
    # Replace tabs and newlines, so we print on one line in known width
    v_str = str(value).replace("\t", "\\t").replace("\n", "\\n")
    # Finally, truncate to the desired display width
    return maybe_truncate(f"{k_str} {v_str}", OPTIONS["display_width"])


# Placeholder shown for an empty mapping section.
EMPTY_REPR = " *empty*"


def _get_col_items(mapping):
    """Get all column items to format, including both keys of `mapping`
    and MultiIndex levels if any.
    """
    from .variable import IndexVariable

    col_items = []
    for k, v in mapping.items():
        col_items.append(k)
        var = getattr(v, "variable", v)
        if isinstance(var, IndexVariable):
            level_names = var.to_index_variable().level_names
            if level_names is not None:
                # MultiIndex levels get their own rows, so count their names.
                col_items += list(level_names)
    return col_items


def _calculate_col_width(col_items):
    """Column width: longest item name (min 7) plus fixed padding."""
    max_name_length = max(len(str(s)) for s in col_items) if col_items else 0
    col_width = max(max_name_length, 7) + 6
    return col_width


def _mapping_repr(
    mapping, title, summarizer, expand_option_name, col_width=None, max_rows=None
):
    """Multi-line repr of a mapping section ("Coordinates:", "Attributes:", ...).

    Collapses to a count when the expand option is off, and elides middle
    rows with "..." when ``max_rows`` is exceeded.
    """
    if col_width is None:
        # Width derived from the mapping's keys (iteration yields keys).
        col_width = _calculate_col_width(mapping)
    summary = [f"{title}:"]
    if mapping:
        len_mapping = len(mapping)
        if not _get_boolean_with_default(expand_option_name, default=True):
            # Collapsed: just "<title>: (<count>)".
            summary = [f"{summary[0]} ({len_mapping})"]
        elif max_rows is not None and len_mapping > max_rows:
            summary = [f"{summary[0]} ({max_rows}/{len_mapping})"]
            # Show the first ceil(max_rows/2) and last floor(max_rows/2)
            # entries with an ellipsis row in between.
            first_rows = max_rows // 2 + max_rows % 2
            keys = list(mapping.keys())
            summary += [summarizer(k, mapping[k], col_width) for k in keys[:first_rows]]
            if max_rows > 1:
                last_rows = max_rows // 2
                summary += [pretty_print(" ...", col_width) + " ..."]
                summary += [
                    summarizer(k, mapping[k], col_width) for k in keys[-last_rows:]
                ]
        else:
            summary += [summarizer(k, v, col_width) for k, v in mapping.items()]
    else:
        summary += [EMPTY_REPR]
    return "\n".join(summary)


data_vars_repr = functools.partial(
    _mapping_repr,
    title="Data variables",
    summarizer=summarize_datavar,
    expand_option_name="display_expand_data_vars",
)


attrs_repr = functools.partial(
    _mapping_repr,
    title="Attributes",
    summarizer=summarize_attr,
    expand_option_name="display_expand_attrs",
)


def coords_repr(coords, col_width=None, max_rows=None):
    """Repr of the "Coordinates:" section; column width accounts for
    MultiIndex level names as well as coordinate names.
    """
    if col_width is None:
        col_width = _calculate_col_width(_get_col_items(coords))
    return _mapping_repr(
        coords,
        title="Coordinates",
        summarizer=summarize_coord,
        expand_option_name="display_expand_coords",
        col_width=col_width,
        max_rows=max_rows,
    )


def indexes_repr(indexes):
    """Multi-line repr of an indexes mapping, one wrapped entry per index."""
    summary = []
    for k, v in indexes.items():
        summary.append(wrap_indent(repr(v), f"{k}: "))
    return "\n".join(summary)


def dim_summary(obj):
    """Comma-separated "dim: size" summary of ``obj.sizes``."""
    elements = [f"{k}: {v}" for k, v in obj.sizes.items()]
    return ", ".join(elements)


def unindexed_dims_repr(dims, coords):
    """Line listing dims without a coordinate, or None if there are none."""
    unindexed_dims = [d for d in dims if d not in coords]
    if unindexed_dims:
        dims_str = ", ".join(f"{d}" for d in unindexed_dims)
        return "Dimensions without coordinates: " + dims_str
    else:
        return None


@contextlib.contextmanager
def set_numpy_options(*args, **kwargs):
    """Temporarily apply numpy print options, restoring the originals on exit."""
    original = np.get_printoptions()
    np.set_printoptions(*args, **kwargs)
    try:
        yield
    finally:
        np.set_printoptions(**original)


def limit_lines(string: str, *, limit: int):
    """
    If the string is more lines than the limit,
    this returns the middle lines replaced by an ellipsis
    """
    lines = string.splitlines()
    if len(lines) > limit:
        string = "\n".join(chain(lines[: limit // 2], ["..."], lines[-limit // 2 :]))
    return string


def short_numpy_repr(array):
    """repr of an array via numpy, tuned to fit the configured display width."""
    array = np.asarray(array)

    # default to lower precision so a full (abbreviated) line can fit on
    # one line with the default display_width
    options = {"precision": 6, "linewidth": OPTIONS["display_width"], "threshold": 200}
    if array.ndim < 3:
        edgeitems = 3
    elif array.ndim == 3:
        edgeitems = 2
    else:
        # Fewer edge items for high-dimensional arrays to keep output short.
        edgeitems = 1
    options["edgeitems"] = edgeitems
    with set_numpy_options(**options):
        return repr(array)


def short_data_repr(array):
    """Format "data" for DataArray and Variable."""
    internal_data = getattr(array, "variable", array)._data
    if isinstance(array, np.ndarray):
        return short_numpy_repr(array)
    elif is_duck_array(internal_data):
        return limit_lines(repr(array.data), limit=40)
    elif array._in_memory or array.size < 1e5:
        # Small lazy arrays are loaded and shown in full (abbreviated) form.
        return short_numpy_repr(array)
    else:
        # internal xarray array type
        return f"[{array.size} values with dtype={array.dtype}]"


def array_repr(arr):
    """Full multi-line repr for DataArray / Variable / IndexVariable."""
    from .variable import Variable

    # used for DataArray, Variable and IndexVariable
    if hasattr(arr, "name") and arr.name is not None:
        name_str = f"{arr.name!r} "
    else:
        name_str = ""

    if (
        isinstance(arr, Variable)
        or _get_boolean_with_default("display_expand_data", default=True)
        or isinstance(arr.variable._data, MemoryCachedArray)
    ):
        data_repr = short_data_repr(arr)
    else:
        # Collapsed data display: a single inline summary line.
        data_repr = inline_variable_array_repr(arr.variable, OPTIONS["display_width"])

    summary = [
        "<xarray.{} {}({})>".format(type(arr).__name__, name_str, dim_summary(arr)),
        data_repr,
    ]

    if hasattr(arr, "coords"):
        if arr.coords:
            summary.append(repr(arr.coords))

        unindexed_dims_str = unindexed_dims_repr(arr.dims, arr.coords)
        if unindexed_dims_str:
            summary.append(unindexed_dims_str)

    if arr.attrs:
        summary.append(attrs_repr(arr.attrs))

    return "\n".join(summary)


def dataset_repr(ds):
    """Full multi-line repr for a Dataset: dims, coords, data vars, attrs."""
    summary = ["<xarray.{}>".format(type(ds).__name__)]

    col_width = _calculate_col_width(_get_col_items(ds.variables))
    max_rows = OPTIONS["display_max_rows"]

    dims_start = pretty_print("Dimensions:", col_width)
    summary.append("{}({})".format(dims_start, dim_summary(ds)))

    if ds.coords:
        summary.append(coords_repr(ds.coords, col_width=col_width, max_rows=max_rows))

    unindexed_dims_str = unindexed_dims_repr(ds.dims, ds.coords)
    if unindexed_dims_str:
        summary.append(unindexed_dims_str)

    summary.append(data_vars_repr(ds.data_vars, col_width=col_width, max_rows=max_rows))

    if ds.attrs:
        summary.append(attrs_repr(ds.attrs, max_rows=max_rows))

    return "\n".join(summary)


def diff_dim_summary(a, b):
    """One-line description of a dimension mismatch, or "" if dims agree."""
    if a.dims != b.dims:
        return "Differing dimensions:\n ({}) != ({})".format(
            dim_summary(a), dim_summary(b)
        )
    else:
        return ""


def _diff_mapping_repr(a_mapping, b_mapping, compat, title, summarizer, col_width=None):
    """Describe how two mappings differ under ``compat``: differing shared
    entries (prefixed L/R) plus entries present on only one side.
    """
    def extra_items_repr(extra_keys, mapping, ab_side):
        # Entries that exist only on one of the two objects.
        extra_repr = [summarizer(k, mapping[k], col_width) for k in extra_keys]
        if extra_repr:
            header = f"{title} only on the {ab_side} object:"
            return [header] + extra_repr
        else:
            return []

    a_keys = set(a_mapping)
    b_keys = set(b_mapping)

    summary = []

    diff_items = []

    for k in a_keys & b_keys:
        try:
            # compare xarray variable
            if not callable(compat):
                compatible = getattr(a_mapping[k], compat)(b_mapping[k])
            else:
                compatible = compat(a_mapping[k], b_mapping[k])
            is_variable = True
        except AttributeError:
            # compare attribute value
            if is_duck_array(a_mapping[k]) or is_duck_array(b_mapping[k]):
                compatible = array_equiv(a_mapping[k], b_mapping[k])
            else:
                compatible = a_mapping[k] == b_mapping[k]

            is_variable = False

        if not compatible:
            # `vars` shadows the builtin here; kept as-is for byte-identity.
            temp = [
                summarizer(k, vars[k], col_width) for vars in (a_mapping, b_mapping)
            ]

            if compat == "identical" and is_variable:
                # "identical" also compares attrs, so show them per side.
                attrs_summary = []

                for m in (a_mapping, b_mapping):
                    attr_s = "\n".join(
                        summarize_attr(ak, av) for ak, av in m[k].attrs.items()
                    )
                    attrs_summary.append(attr_s)

                temp = [
                    "\n".join([var_s, attr_s]) if attr_s else var_s
                    for var_s, attr_s in zip(temp, attrs_summary)
                ]

            # Replace the leading marker character with an L/R side label.
            diff_items += [ab_side + s[1:] for ab_side, s in zip(("L", "R"), temp)]

    if diff_items:
        summary += [f"Differing {title.lower()}:"] + diff_items

    summary += extra_items_repr(a_keys - b_keys, a_mapping, "left")
    summary += extra_items_repr(b_keys - a_keys, b_mapping, "right")

    return "\n".join(summary)


diff_coords_repr = functools.partial(
    _diff_mapping_repr, title="Coordinates", summarizer=summarize_coord
)


diff_data_vars_repr = functools.partial(
    _diff_mapping_repr, title="Data variables", summarizer=summarize_datavar
)


diff_attrs_repr = functools.partial(
    _diff_mapping_repr, title="Attributes", summarizer=summarize_attr
)


def _compat_to_str(compat):
    """Human-readable adjective for a compat mode or comparison callable."""
    if callable(compat):
        compat = compat.__name__

    if compat == "equals":
        return "equal"
    elif compat == "allclose":
        return "close"
    else:
        return compat


def diff_array_repr(a, b, compat):
    """Explain why two array-like objects fail the ``compat`` comparison."""
    # used for DataArray, Variable and IndexVariable
    summary = [
        "Left and right {} objects are not {}".format(
            type(a).__name__, _compat_to_str(compat)
        )
    ]

    summary.append(diff_dim_summary(a, b))
    if callable(compat):
        equiv = compat
    else:
        equiv = array_equiv

    if not equiv(a.data, b.data):
        temp = [wrap_indent(short_numpy_repr(obj), start=" ") for obj in (a, b)]
        diff_data_repr = [
            ab_side + "\n" + ab_data_repr
            for ab_side, ab_data_repr in zip(("L", "R"), temp)
        ]
        summary += ["Differing values:"] + diff_data_repr

    if hasattr(a, "coords"):
        col_width = _calculate_col_width(set(a.coords) | set(b.coords))
        summary.append(
            diff_coords_repr(a.coords, b.coords, compat, col_width=col_width)
        )

    if compat == "identical":
        summary.append(diff_attrs_repr(a.attrs, b.attrs, compat))

    return "\n".join(summary)


def diff_dataset_repr(a, b, compat):
    """Explain why two Dataset objects fail the ``compat`` comparison."""
    summary = [
        "Left and right {} objects are not {}".format(
            type(a).__name__, _compat_to_str(compat)
        )
    ]

    col_width = _calculate_col_width(
        set(_get_col_items(a.variables) + _get_col_items(b.variables))
    )

    summary.append(diff_dim_summary(a, b))
    summary.append(diff_coords_repr(a.coords, b.coords, compat, col_width=col_width))
    summary.append(
        diff_data_vars_repr(a.data_vars, b.data_vars, compat, col_width=col_width)
    )

    if compat == "identical":
        summary.append(diff_attrs_repr(a.attrs, b.attrs, compat))

    return "\n".join(summary)