"""Encode/decode xarray Variables to and from CF (Climate and Forecast)
metadata conventions.

Covers masking/scaling, CF time and timedelta encoding, string/character
handling, endianness and boolean dtype tricks needed for netCDF, and the
bookkeeping of the ``coordinates`` attribute and related CF attributes
(``bounds``, ``grid_mapping``, ``cell_measures``, ...).
"""
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd

from .coding import strings, times, variables
from .coding.variables import SerializationWarning, pop_to
from .core import duck_array_ops, indexing
from .core.common import contains_cftime_datetimes
from .core.pycompat import is_duck_dask_array
from .core.variable import IndexVariable, Variable, as_variable

# Attribute names whose values reference *other variables* in the dataset;
# on encode they are moved from attrs to encoding, and with
# decode_coords="all" the referenced variables become coordinates.
CF_RELATED_DATA = (
    "bounds",
    "grid_mapping",
    "climatology",
    "geometry",
    "node_coordinates",
    "node_count",
    "part_node_count",
    "interior_ring",
    "cell_measures",
    "formula_terms",
)
# Subset of CF_RELATED_DATA whose value is "role: name role: name ..."
# pairs rather than a plain whitespace-separated list of variable names.
CF_RELATED_DATA_NEEDS_PARSING = (
    "cell_measures",
    "formula_terms",
)


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from non-native to native endianness

    This is useful for decoding arrays from netCDF3 files (which are all
    big endian) into native endianness, so they can be used with Cython
    functions, such as those found in bottleneck and pandas.

    >>> x = np.arange(5, dtype=">i2")

    >>> x.dtype
    dtype('>i2')

    >>> NativeEndiannessArray(x).dtype
    dtype('int16')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> NativeEndiannessArray(x)[indexer].dtype
    dtype('int16')
    """

    __slots__ = ("array",)

    def __init__(self, array):
        self.array = indexing.as_indexable(array)

    @property
    def dtype(self):
        # kind + itemsize (e.g. "i" + "2") spells the native-endian dtype.
        return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))

    def __getitem__(self, key):
        # Conversion happens lazily, only for the indexed selection.
        return np.asarray(self.array[key], dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Decode arrays on the fly from integer to boolean datatype

    This is useful for decoding boolean arrays from integer typed netCDF
    variables.

    >>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

    >>> x.dtype
    dtype('int8')

    >>> BoolTypeArray(x).dtype
    dtype('bool')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> BoolTypeArray(x)[indexer].dtype
    dtype('bool')
    """

    __slots__ = ("array",)

    def __init__(self, array):
        self.array = indexing.as_indexable(array)

    @property
    def dtype(self):
        return np.dtype("bool")

    def __getitem__(self, key):
        # Conversion happens lazily, only for the indexed selection.
        return np.asarray(self.array[key], dtype=self.dtype)


def _var_as_tuple(var):
    """Unpack a Variable into (dims, data, attrs-copy, encoding-copy).

    attrs and encoding are shallow-copied so callers can mutate them
    without affecting the input variable.
    """
    return var.dims, var.data, var.attrs.copy(), var.encoding.copy()


def maybe_encode_nonstring_dtype(var, name=None):
    """Cast var to the (non-string) dtype requested in ``var.encoding["dtype"]``.

    Warns when float data is about to be rounded into an integer dtype
    without any _FillValue/missing_value to represent NaNs.
    """
    if "dtype" in var.encoding and var.encoding["dtype"] not in ("S1", str):
        dims, data, attrs, encoding = _var_as_tuple(var)
        # pop: the target dtype is consumed here and must not remain in encoding
        dtype = np.dtype(encoding.pop("dtype"))
        if dtype != var.dtype:
            if np.issubdtype(dtype, np.integer):
                if (
                    np.issubdtype(var.dtype, np.floating)
                    and "_FillValue" not in var.attrs
                    and "missing_value" not in var.attrs
                ):
                    warnings.warn(
                        f"saving variable {name} with floating "
                        "point data as an integer dtype without "
                        "any _FillValue to use for NaNs",
                        SerializationWarning,
                        stacklevel=10,
                    )
                # round rather than truncate before the integer cast;
                # [...] materializes the result as an in-memory array
                data = duck_array_ops.around(data)[...]
            data = data.astype(dtype=dtype)
        var = Variable(dims, data, attrs, encoding)
    return var


def maybe_default_fill_value(var):
    """Set attrs["_FillValue"] = NaN for float variables lacking one."""
    # make NaN the fill value for float types:
    if (
        "_FillValue" not in var.attrs
        and "_FillValue" not in var.encoding
        and np.issubdtype(var.dtype, np.floating)
    ):
        var.attrs["_FillValue"] = var.dtype.type(np.nan)
    return var


def maybe_encode_bools(var):
    """Store bool data as int8 (netCDF has no bool), recording attrs["dtype"]
    = "bool" so decode_cf_variable can restore the boolean dtype."""
    if (
        (var.dtype == bool)
        and ("dtype" not in var.encoding)
        and ("dtype" not in var.attrs)
    ):
        dims, data, attrs, encoding = _var_as_tuple(var)
        attrs["dtype"] = "bool"
        data = data.astype(dtype="i1", copy=True)
        var = Variable(dims, data, attrs, encoding)
    return var


def _infer_dtype(array, name=None):
    """Given an object array with no missing values, infer its dtype from its
    first element
    """
    if array.dtype.kind != "O":
        raise TypeError("infer_type must be called on a dtype=object array")

    if array.size == 0:
        return np.dtype(float)

    # index with a tuple of zeros to fetch the first element of an
    # arbitrary-dimensional array
    element = array[(0,) * array.ndim]
    if isinstance(element, (bytes, str)):
        return strings.create_vlen_dtype(type(element))

    dtype = np.array(element).dtype
    if dtype.kind != "O":
        return dtype

    raise ValueError(
        "unable to infer dtype on variable {!r}; xarray "
        "cannot serialize arbitrary Python objects".format(name)
    )


def ensure_not_multiindex(var, name=None):
    """Raise NotImplementedError if var wraps a pandas MultiIndex, which
    cannot be serialized to netCDF."""
    if isinstance(var, IndexVariable) and isinstance(var.to_index(), pd.MultiIndex):
        raise NotImplementedError(
            "variable {!r} is a MultiIndex, which cannot yet be "
            "serialized to netCDF files "
            "(https://github.com/pydata/xarray/issues/1077). Use "
            "reset_index() to convert MultiIndex levels into coordinate "
            "variables instead.".format(name)
        )


def _copy_with_dtype(data, dtype):
    """Create a copy of an array with the given dtype.

    We use this instead of np.array() to ensure that custom object dtypes end
    up on the resulting array.
    """
    result = np.empty(data.shape, dtype)
    result[...] = data
    return result


def ensure_dtype_not_object(var, name=None):
    """Convert object-dtype data to a concrete serializable dtype, filling
    missing values with a dtype-appropriate sentinel (b"", "", or NaN)."""
    # TODO: move this from conventions to backends? (it's not CF related)
    if var.dtype.kind == "O":
        dims, data, attrs, encoding = _var_as_tuple(var)

        if is_duck_dask_array(data):
            warnings.warn(
                "variable {} has data in the form of a dask array with "
                "dtype=object, which means it is being loaded into memory "
                "to determine a data type that can be safely stored on disk. "
                "To avoid this, coerce this variable to a fixed-size dtype "
                "with astype() before saving it.".format(name),
                SerializationWarning,
            )
            data = data.compute()

        missing = pd.isnull(data)
        if missing.any():
            # nb. this will fail for dask.array data
            non_missing_values = data[~missing]
            inferred_dtype = _infer_dtype(non_missing_values, name)

            # There is no safe bit-pattern for NA in typical binary string
            # formats, we so can't set a fill_value. Unfortunately, this means
            # we can't distinguish between missing values and empty strings.
            if strings.is_bytes_dtype(inferred_dtype):
                fill_value = b""
            elif strings.is_unicode_dtype(inferred_dtype):
                fill_value = ""
            else:
                # insist on using float for numeric values
                if not np.issubdtype(inferred_dtype, np.floating):
                    inferred_dtype = np.dtype(float)
                fill_value = inferred_dtype.type(np.nan)

            data = _copy_with_dtype(data, dtype=inferred_dtype)
            data[missing] = fill_value
        else:
            data = _copy_with_dtype(data, dtype=_infer_dtype(data, name))

        # object dtype is only acceptable if it carries metadata
        # (e.g. vlen-string dtypes created by strings.create_vlen_dtype)
        assert data.dtype.kind != "O" or data.dtype.metadata
        var = Variable(dims, data, attrs, encoding)
    return var


def encode_cf_variable(var, needs_copy=True, name=None):
    """
    Converts an Variable into an Variable which follows some
    of the CF conventions:

        - Nans are masked using _FillValue (or the deprecated missing_value)
        - Rescaling via: scale_factor and add_offset
        - datetimes are converted to the CF 'units since time' format
        - dtype encodings are enforced.

    Parameters
    ----------
    var : Variable
        A variable holding un-encoded data.

    Returns
    -------
    out : Variable
        A variable which has been encoded as described above.
    """
    # NOTE(review): ``needs_copy`` is accepted but never read in this body —
    # presumably kept for call-compatibility; confirm against callers.
    ensure_not_multiindex(var, name=name)

    for coder in [
        times.CFDatetimeCoder(),
        times.CFTimedeltaCoder(),
        variables.CFScaleOffsetCoder(),
        variables.CFMaskCoder(),
        variables.UnsignedIntegerCoder(),
    ]:
        var = coder.encode(var, name=name)

    # TODO(shoyer): convert all of these to use coders, too:
    var = maybe_encode_nonstring_dtype(var, name=name)
    var = maybe_default_fill_value(var)
    var = maybe_encode_bools(var)
    var = ensure_dtype_not_object(var, name=name)

    # move variable-referencing attributes into encoding so they are not
    # written as plain attrs
    for attr_name in CF_RELATED_DATA:
        pop_to(var.encoding, var.attrs, attr_name)
    return var


def decode_cf_variable(
    name,
    var,
    concat_characters=True,
    mask_and_scale=True,
    decode_times=True,
    decode_endianness=True,
    stack_char_dim=True,
    use_cftime=None,
    decode_timedelta=None,
):
    """
    Decodes a variable which may hold CF encoded information.

    This includes variables that have been masked and scaled, which
    hold CF style time variables (this is almost always the case if
    the dataset has been serialized) and which have strings encoded
    as character arrays.

    Parameters
    ----------
    name : str
        Name of the variable. Used for better error messages.
    var : Variable
        A variable holding potentially CF encoded information.
    concat_characters : bool
        Should character arrays be concatenated to strings, for
        example: ["h", "e", "l", "l", "o"] -> "hello"
    mask_and_scale : bool
        Lazily scale (using scale_factor and add_offset) and mask
        (using _FillValue). If the _Unsigned attribute is present
        treat integer arrays as unsigned.
    decode_times : bool
        Decode cf times ("hours since 2000-01-01") to np.datetime64.
    decode_endianness : bool
        Decode arrays from non-native to native endianness.
    stack_char_dim : bool
        Whether to stack characters into bytes along the last dimension of this
        array. Passed as an argument because we need to look at the full
        dataset to figure out if this is appropriate.
    use_cftime : bool, optional
        Only relevant if encoded dates come from a standard calendar
        (e.g. "gregorian", "proleptic_gregorian", "standard", or not
        specified).  If None (default), attempt to decode times to
        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
        ``cftime.datetime`` objects. If True, always decode times to
        ``cftime.datetime`` objects, regardless of whether or not they can be
        represented using ``np.datetime64[ns]`` objects.  If False, always
        decode times to ``np.datetime64[ns]`` objects; if this is not possible
        raise an error.
    decode_timedelta : bool, optional
        Decode variables with time-delta units to np.timedelta64 via
        CFTimedeltaCoder. If None (default), use the value of
        ``decode_times``.

    Returns
    -------
    out : Variable
        A variable holding the decoded equivalent of var.
    """
    var = as_variable(var)
    original_dtype = var.dtype

    if decode_timedelta is None:
        decode_timedelta = decode_times

    if concat_characters:
        if stack_char_dim:
            var = strings.CharacterArrayCoder().decode(var, name=name)
        var = strings.EncodedStringCoder().decode(var)

    if mask_and_scale:
        # inverse order of the encode pipeline in encode_cf_variable
        for coder in [
            variables.UnsignedIntegerCoder(),
            variables.CFMaskCoder(),
            variables.CFScaleOffsetCoder(),
        ]:
            var = coder.decode(var, name=name)

    if decode_timedelta:
        var = times.CFTimedeltaCoder().decode(var, name=name)
    if decode_times:
        var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name)

    dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)
    # TODO(shoyer): convert everything below to use coders

    if decode_endianness and not data.dtype.isnative:
        # do this last, so it's only done if we didn't already unmask/scale
        data = NativeEndiannessArray(data)
        original_dtype = data.dtype

    encoding.setdefault("dtype", original_dtype)

    if "dtype" in attributes and attributes["dtype"] == "bool":
        # written by maybe_encode_bools on encode; restore boolean dtype
        del attributes["dtype"]
        data = BoolTypeArray(data)

    if not is_duck_dask_array(data):
        data = indexing.LazilyIndexedArray(data)

    return Variable(dimensions, data, attributes, encoding=encoding)


def _update_bounds_attributes(variables):
    """Adds time attributes to time bounds variables.

    Variables handling time bounds ("Cell boundaries" in the CF
    conventions) do not necessarily carry the necessary attributes to be
    decoded. This copies the attributes from the time variable to the
    associated boundaries.

    See Also:

    http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/
    cf-conventions.html#cell-boundaries

    https://github.com/pydata/xarray/issues/2565
    """

    # For all time variables with bounds
    for v in variables.values():
        attrs = v.attrs
        has_date_units = "units" in attrs and "since" in attrs["units"]
        if has_date_units and "bounds" in attrs:
            if attrs["bounds"] in variables:
                bounds_attrs = variables[attrs["bounds"]].attrs
                # setdefault: never overwrite attributes the bounds
                # variable already carries
                bounds_attrs.setdefault("units", attrs["units"])
                if "calendar" in attrs:
                    bounds_attrs.setdefault("calendar", attrs["calendar"])


def _update_bounds_encoding(variables):
    """Adds time encoding to time bounds variables.

    Variables handling time bounds ("Cell boundaries" in the CF
    conventions) do not necessarily carry the necessary attributes to be
    decoded. This copies the encoding from the time variable to the
    associated bounds variable so that we write CF-compliant files.

    See Also:

    http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/
    cf-conventions.html#cell-boundaries

    https://github.com/pydata/xarray/issues/2565
    """

    # For all time variables with bounds
    for v in variables.values():
        attrs = v.attrs
        encoding = v.encoding
        has_date_units = "units" in encoding and "since" in encoding["units"]
        is_datetime_type = np.issubdtype(
            v.dtype, np.datetime64
        ) or contains_cftime_datetimes(v)

        if (
            is_datetime_type
            and not has_date_units
            and "bounds" in attrs
            and attrs["bounds"] in variables
        ):
            warnings.warn(
                "Variable '{0}' has datetime type and a "
                "bounds variable but {0}.encoding does not have "
                "units specified. The units encodings for '{0}' "
                "and '{1}' will be determined independently "
                "and may not be equal, counter to CF-conventions. "
                "If this is a concern, specify a units encoding for "
                "'{0}' before writing to a file.".format(v.name, attrs["bounds"]),
                UserWarning,
            )

        if has_date_units and "bounds" in attrs:
            if attrs["bounds"] in variables:
                bounds_encoding = variables[attrs["bounds"]].encoding
                # setdefault: never overwrite an encoding the bounds
                # variable already carries
                bounds_encoding.setdefault("units", encoding["units"])
                if "calendar" in encoding:
                    bounds_encoding.setdefault("calendar", encoding["calendar"])


def decode_cf_variables(
    variables,
    attributes,
    concat_characters=True,
    mask_and_scale=True,
    decode_times=True,
    decode_coords=True,
    drop_variables=None,
    use_cftime=None,
    decode_timedelta=None,
):
    """
    Decode several CF encoded variables.

    See: decode_cf_variable
    """
    dimensions_used_by = defaultdict(list)
    for v in variables.values():
        for d in v.dims:
            dimensions_used_by[d].append(v)

    def stackable(dim):
        # figure out if a dimension can be concatenated over
        if dim in variables:
            return False
        for v in dimensions_used_by[dim]:
            if v.dtype.kind != "S" or dim != v.dims[-1]:
                return False
        return True

    coord_names = set()

    if isinstance(drop_variables, str):
        drop_variables = [drop_variables]
    elif drop_variables is None:
        drop_variables = []
    drop_variables = set(drop_variables)

    # Time bounds coordinates might miss the decoding attributes
    if decode_times:
        _update_bounds_attributes(variables)

    new_vars = {}
    for k, v in variables.items():
        if k in drop_variables:
            continue
        # only stack the character dim of char arrays whose last dimension
        # is not shared with a non-character variable
        stack_char_dim = (
            concat_characters
            and v.dtype == "S1"
            and v.ndim > 0
            and stackable(v.dims[-1])
        )
        new_vars[k] = decode_cf_variable(
            k,
            v,
            concat_characters=concat_characters,
            mask_and_scale=mask_and_scale,
            decode_times=decode_times,
            stack_char_dim=stack_char_dim,
            use_cftime=use_cftime,
            decode_timedelta=decode_timedelta,
        )
        if decode_coords in [True, "coordinates", "all"]:
            var_attrs = new_vars[k].attrs
            if "coordinates" in var_attrs:
                coord_str = var_attrs["coordinates"]
                var_coord_names = coord_str.split()
                # only promote to coordinates if every referenced variable
                # actually exists in this dataset
                if all(k in variables for k in var_coord_names):
                    new_vars[k].encoding["coordinates"] = coord_str
                    del var_attrs["coordinates"]
                    coord_names.update(var_coord_names)

        if decode_coords == "all":
            for attr_name in CF_RELATED_DATA:
                if attr_name in var_attrs:
                    attr_val = var_attrs[attr_name]
                    if attr_name not in CF_RELATED_DATA_NEEDS_PARSING:
                        var_names = attr_val.split()
                    else:
                        # value is "role: name role: name ..."; split into
                        # alternating role/name tokens
                        roles_and_names = [
                            role_or_name
                            for part in attr_val.split(":")
                            for role_or_name in part.split()
                        ]
                        if len(roles_and_names) % 2 == 1:
                            warnings.warn(
                                f"Attribute {attr_name:s} malformed", stacklevel=5
                            )
                        var_names = roles_and_names[1::2]
                    if all(var_name in variables for var_name in var_names):
                        new_vars[k].encoding[attr_name] = attr_val
                        coord_names.update(var_names)
                    else:
                        referenced_vars_not_in_variables = [
                            proj_name
                            for proj_name in var_names
                            if proj_name not in variables
                        ]
                        warnings.warn(
                            f"Variable(s) referenced in {attr_name:s} not in variables: {referenced_vars_not_in_variables!s}",
                            stacklevel=5,
                        )
                    del var_attrs[attr_name]

    if decode_coords and "coordinates" in attributes:
        # copy before mutating: the caller's attributes dict is not ours
        attributes = dict(attributes)
        coord_names.update(attributes.pop("coordinates").split())

    return new_vars, attributes, coord_names


def decode_cf(
    obj,
    concat_characters=True,
    mask_and_scale=True,
    decode_times=True,
    decode_coords=True,
    drop_variables=None,
    use_cftime=None,
    decode_timedelta=None,
):
    """Decode the given Dataset or Datastore according to CF conventions into
    a new Dataset.

    Parameters
    ----------
    obj : Dataset or DataStore
        Object to decode.
    concat_characters : bool, optional
        Should character arrays be concatenated to strings, for
        example: ["h", "e", "l", "l", "o"] -> "hello"
    mask_and_scale : bool, optional
        Lazily scale (using scale_factor and add_offset) and mask
        (using _FillValue).
    decode_times : bool, optional
        Decode cf times (e.g., integers since "hours since 2000-01-01") to
        np.datetime64.
    decode_coords : bool or {"coordinates", "all"}, optional
        Controls which variables are set as coordinate variables:

        - "coordinates" or True: Set variables referred to in the
          ``'coordinates'`` attribute of the datasets or individual variables
          as coordinate variables.
        - "all": Set variables referred to in  ``'grid_mapping'``, ``'bounds'`` and
          other attributes as coordinate variables.
    drop_variables : str or iterable, optional
        A variable or list of variables to exclude from being parsed from the
        dataset. This may be useful to drop variables with problems or
        inconsistent values.
    use_cftime : bool, optional
        Only relevant if encoded dates come from a standard calendar
        (e.g. "gregorian", "proleptic_gregorian", "standard", or not
        specified).  If None (default), attempt to decode times to
        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
        ``cftime.datetime`` objects. If True, always decode times to
        ``cftime.datetime`` objects, regardless of whether or not they can be
        represented using ``np.datetime64[ns]`` objects.  If False, always
        decode times to ``np.datetime64[ns]`` objects; if this is not possible
        raise an error.
    decode_timedelta : bool, optional
        If True, decode variables and coordinates with time units in
        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
        into timedelta objects. If False, leave them encoded as numbers.
        If None (default), assume the same value of decode_time.

    Returns
    -------
    decoded : Dataset
    """
    # local imports avoid a circular dependency at module import time
    from .backends.common import AbstractDataStore
    from .core.dataset import Dataset

    if isinstance(obj, Dataset):
        vars = obj._variables
        attrs = obj.attrs
        extra_coords = set(obj.coords)
        close = obj._close
        encoding = obj.encoding
    elif isinstance(obj, AbstractDataStore):
        vars, attrs = obj.load()
        extra_coords = set()
        close = obj.close
        encoding = obj.get_encoding()
    else:
        raise TypeError("can only decode Dataset or DataStore objects")

    vars, attrs, coord_names = decode_cf_variables(
        vars,
        attrs,
        concat_characters,
        mask_and_scale,
        decode_times,
        decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )
    ds = Dataset(vars, attrs=attrs)
    # only mark names as coordinates if they survived decoding
    ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
    ds.set_close(close)
    ds.encoding = encoding

    return ds


def cf_decoder(
    variables,
    attributes,
    concat_characters=True,
    mask_and_scale=True,
    decode_times=True,
):
    """
    Decode a set of CF encoded variables and attributes.

    Parameters
    ----------
    variables : dict
        A dictionary mapping from variable name to xarray.Variable
    attributes : dict
        A dictionary mapping from attribute name to value
    concat_characters : bool
        Should character arrays be concatenated to strings, for
        example: ["h", "e", "l", "l", "o"] -> "hello"
    mask_and_scale : bool
        Lazily scale (using scale_factor and add_offset) and mask
        (using _FillValue).
    decode_times : bool
        Decode cf times ("hours since 2000-01-01") to np.datetime64.

    Returns
    -------
    decoded_variables : dict
        A dictionary mapping from variable name to xarray.Variable objects.
    decoded_attributes : dict
        A dictionary mapping from attribute name to values.

    See Also
    --------
    decode_cf_variable
    """
    variables, attributes, _ = decode_cf_variables(
        variables, attributes, concat_characters, mask_and_scale, decode_times
    )
    return variables, attributes


def _encode_coordinates(variables, attributes, non_dim_coord_names):
    """Write non-dimension coordinate names into per-variable
    ``coordinates`` attributes, falling back to a global ``coordinates``
    attribute for coordinates not associated with any variable.

    Returns the (possibly shallow-copied) variables dict and attributes dict.
    """
    # calculate global and variable specific coordinates
    non_dim_coord_names = set(non_dim_coord_names)

    for name in list(non_dim_coord_names):
        if isinstance(name, str) and " " in name:
            # a space would corrupt the whitespace-separated attribute value
            warnings.warn(
                "coordinate {!r} has a space in its name, which means it "
                "cannot be marked as a coordinate on disk and will be "
                "saved as a data variable instead".format(name),
                SerializationWarning,
                stacklevel=6,
            )
            non_dim_coord_names.discard(name)

    global_coordinates = non_dim_coord_names.copy()
    variable_coordinates = defaultdict(set)
    not_technically_coordinates = set()
    for coord_name in non_dim_coord_names:
        target_dims = variables[coord_name].dims
        for k, v in variables.items():
            # a coordinate applies to a variable if the variable's dims
            # cover all of the coordinate's dims
            if (
                k not in non_dim_coord_names
                and k not in v.dims
                and set(target_dims) <= set(v.dims)
            ):
                variable_coordinates[k].add(coord_name)

            # names already referenced via CF attributes (bounds,
            # grid_mapping, ...) should not also appear in "coordinates"
            if any(
                attr_name in v.encoding and coord_name in v.encoding.get(attr_name)
                for attr_name in CF_RELATED_DATA
            ):
                not_technically_coordinates.add(coord_name)
                global_coordinates.discard(coord_name)

    variables = {k: v.copy(deep=False) for k, v in variables.items()}

    # keep track of variable names written to file under the "coordinates" attributes
    written_coords = set()
    for name, var in variables.items():
        encoding = var.encoding
        attrs = var.attrs
        if "coordinates" in attrs and "coordinates" in encoding:
            raise ValueError(
                f"'coordinates' found in both attrs and encoding for variable {name!r}."
            )

        # if coordinates set to None, don't write coordinates attribute
        if (
            "coordinates" in attrs
            and attrs.get("coordinates") is None
            or "coordinates" in encoding
            and encoding.get("coordinates") is None
        ):
            # make sure "coordinates" is removed from attrs/encoding
            attrs.pop("coordinates", None)
            encoding.pop("coordinates", None)
            continue

        # this will copy coordinates from encoding to attrs if "coordinates" in attrs
        # after the next line, "coordinates" is never in encoding
        # we get support for attrs["coordinates"] for free.
        coords_str = pop_to(encoding, attrs, "coordinates")
        if not coords_str and variable_coordinates[name]:
            coordinates_text = " ".join(
                str(coord_name)
                for coord_name in variable_coordinates[name]
                if coord_name not in not_technically_coordinates
            )
            if coordinates_text:
                attrs["coordinates"] = coordinates_text
        if "coordinates" in attrs:
            written_coords.update(attrs["coordinates"].split())

    # These coordinates are not associated with any particular variables, so we
    # save them under a global 'coordinates' attribute so xarray can roundtrip
    # the dataset faithfully. Because this serialization goes beyond CF
    # conventions, only do it if necessary.
    # Reference discussion:
    # http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/007571.html
    global_coordinates.difference_update(written_coords)
    if global_coordinates:
        attributes = dict(attributes)
        if "coordinates" in attributes:
            # NOTE(review): message lacks a space between "roundtripping"
            # and "of"; runtime string left byte-identical here.
            warnings.warn(
                f"cannot serialize global coordinates {global_coordinates!r} because the global "
                f"attribute 'coordinates' already exists. This may prevent faithful roundtripping"
                f"of xarray datasets",
                SerializationWarning,
            )
        else:
            attributes["coordinates"] = " ".join(map(str, global_coordinates))

    return variables, attributes


def encode_dataset_coordinates(dataset):
    """Encode coordinates on the given dataset object into variable specific
    and global attributes.

    When possible, this is done according to CF conventions.

    Parameters
    ----------
    dataset : Dataset
        Object to encode.

    Returns
    -------
    variables : dict
    attrs : dict
    """
    non_dim_coord_names = set(dataset.coords) - set(dataset.dims)
    return _encode_coordinates(
        dataset._variables, dataset.attrs, non_dim_coord_names=non_dim_coord_names
    )


def cf_encoder(variables, attributes):
    """
    Encode a set of CF encoded variables and attributes.
    Takes a dicts of variables and attributes and encodes them
    to conform to CF conventions as much as possible.
    This includes masking, scaling, character array handling,
    and CF-time encoding.

    Parameters
    ----------
    variables : dict
        A dictionary mapping from variable name to xarray.Variable
    attributes : dict
        A dictionary mapping from attribute name to value

    Returns
    -------
    encoded_variables : dict
        A dictionary mapping from variable name to xarray.Variable,
    encoded_attributes : dict
        A dictionary mapping from attribute name to value

    See Also
    --------
    decode_cf_variable, encode_cf_variable
    """

    # add encoding for time bounds variables if present.
    _update_bounds_encoding(variables)

    new_vars = {k: encode_cf_variable(v, name=k) for k, v in variables.items()}

    # Remove attrs from bounds variables (issue #2921)
    for var in new_vars.values():
        bounds = var.attrs["bounds"] if "bounds" in var.attrs else None
        if bounds and bounds in new_vars:
            # see http://cfconventions.org/cf-conventions/cf-conventions.html#cell-boundaries
            # drop attributes the bounds variable merely inherited from its
            # parent; keep them when the values differ
            for attr in [
                "units",
                "standard_name",
                "axis",
                "positive",
                "calendar",
                "long_name",
                "leap_month",
                "leap_year",
                "month_lengths",
            ]:
                if attr in new_vars[bounds].attrs and attr in var.attrs:
                    if new_vars[bounds].attrs[attr] == var.attrs[attr]:
                        new_vars[bounds].attrs.pop(attr)

    return new_vars, attributes