#-----------------------------------------------------------------------------
# Copyright (c) 2012 - 2021, Anaconda, Inc., and Bokeh Contributors.
# All rights reserved.
#
# The full license is in the file LICENSE.txt, distributed with this software.
#-----------------------------------------------------------------------------
'''
Functions for helping with serialization and deserialization of
Bokeh objects.

Certain NumPy array dtypes can be serialized to a binary format for
performance and efficiency. The list of supported dtypes is:

{binary_array_types}

'''

#-----------------------------------------------------------------------------
# Boilerplate
#-----------------------------------------------------------------------------
import logging # isort:skip
log = logging.getLogger(__name__)

#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------

# Standard library imports
import base64
import datetime as dt
import sys
import uuid
from math import isinf, isnan
from threading import Lock

# External imports
import numpy as np

# Bokeh imports
from ..settings import settings
from .dependencies import import_optional
from .string import format_docstring

#-----------------------------------------------------------------------------
# Globals and constants
#-----------------------------------------------------------------------------

pd = import_optional('pandas')

# dtypes that serialize_array() will emit in binary (base64 or raw buffer)
# form; every other dtype is converted to a plain list.
BINARY_ARRAY_TYPES = {
    np.dtype(np.float32),
    np.dtype(np.float64),
    np.dtype(np.uint8),
    np.dtype(np.int8),
    np.dtype(np.uint16),
    np.dtype(np.int16),
    np.dtype(np.uint32),
    np.dtype(np.int32),
}

DATETIME_TYPES = {
    dt.time,
    dt.datetime,
    np.datetime64,
}

if pd:
    try:
        _pd_timestamp = pd.Timestamp
    except AttributeError:
        _pd_timestamp = pd.tslib.Timestamp
    DATETIME_TYPES.add(_pd_timestamp)
    DATETIME_TYPES.add(pd.Timedelta)
    DATETIME_TYPES.add(pd.Period)
    DATETIME_TYPES.add(type(pd.NaT))

NP_EPOCH = np.datetime64(0, 'ms')
NP_MS_DELTA = np.timedelta64(1, 'ms')

# Naive datetime for the Unix epoch. Spelled out explicitly because
# ``datetime.utcfromtimestamp`` is deprecated as of Python 3.12; the value is
# identical to ``utcfromtimestamp(0)``.
DT_EPOCH = dt.datetime(1970, 1, 1)

# Sorted so that the generated documentation does not depend on set ordering.
__doc__ = format_docstring(__doc__, binary_array_types="\n".join("* ``np." + str(x) + "``" for x in sorted(BINARY_ARRAY_TYPES, key=str)))

__all__ = (
    'array_encoding_disabled',
    'convert_date_to_datetime',
    'convert_datetime_array',
    'convert_datetime_type',
    'convert_timedelta_type',
    'decode_base64_dict',
    'encode_binary_dict',
    'encode_base64_dict',
    'is_datetime_type',
    'is_timedelta_type',
    'make_globally_unique_id',
    'make_id',
    'serialize_array',
    'transform_array',
    'transform_array_to_list',
    'transform_column_source_data',
    'traverse_data',
    'transform_series',
)

#-----------------------------------------------------------------------------
# General API
#-----------------------------------------------------------------------------

def is_datetime_type(obj):
    ''' Whether an object is any date, time, or datetime type recognized by
    Bokeh.

    The recognized types are the members of ``DATETIME_TYPES`` (which also
    includes several Pandas types when Pandas is installed).

    Args:
        obj (object) : the object to test

    Returns:
        bool : True if ``obj`` is a datetime type

    '''
    return isinstance(obj, _dt_tuple)

def is_timedelta_type(obj):
    ''' Whether an object is any timedelta type recognized by Bokeh.

    Args:
        obj (object) : the object to test

    Returns:
        bool : True if ``obj`` is a timedelta type

    '''
    return isinstance(obj, (dt.timedelta, np.timedelta64))

def convert_date_to_datetime(obj):
    ''' Convert a date object to floating point milliseconds since epoch.

    Only the first six fields of ``obj.timetuple()`` (year through second)
    are used, so any sub-second component is not included.

    Args:
        obj (date) : the object to convert

    Returns:
        float : milliseconds since epoch

    '''
    return (dt.datetime(*obj.timetuple()[:6]) - DT_EPOCH).total_seconds() * 1000

def convert_timedelta_type(obj):
    ''' Convert any recognized timedelta value to floating point absolute
    milliseconds.

    Args:
        obj (object) : the object to convert

    Returns:
        float : milliseconds, or None if ``obj`` is neither a
        ``datetime.timedelta`` nor a ``np.timedelta64``

    '''
    if isinstance(obj, dt.timedelta):
        return obj.total_seconds() * 1000.
    elif isinstance(obj, np.timedelta64):
        return (obj / NP_MS_DELTA)

def convert_datetime_type(obj):
    ''' Convert any recognized date, time, or datetime value to floating point
    milliseconds since epoch.

    Args:
        obj (object) : the object to convert

    Returns:
        float : milliseconds (``nan`` for Pandas ``NaT``), or None if ``obj``
        is not one of the handled types

    '''
    # Pandas NaT
    if pd and obj is pd.NaT:
        return np.nan

    # Pandas Period
    if pd and isinstance(obj, pd.Period):
        return obj.to_timestamp().value / 10**6.0

    # Pandas Timestamp
    if pd and isinstance(obj, _pd_timestamp): return obj.value / 10**6.0

    # Pandas Timedelta
    elif pd and isinstance(obj, pd.Timedelta): return obj.value / 10**6.0

    # Datetime (datetime is a subclass of date)
    elif isinstance(obj, dt.datetime):
        # tzinfo is dropped without any conversion, so the wall-clock fields
        # are compared directly against the naive epoch
        diff = obj.replace(tzinfo=None) - DT_EPOCH
        return diff.total_seconds() * 1000

    # XXX (bev) ideally this would not be here "dates are not datetimes"
    # Date
    elif isinstance(obj, dt.date):
        return convert_date_to_datetime(obj)

    # NumPy datetime64
    elif isinstance(obj, np.datetime64):
        epoch_delta = obj - NP_EPOCH
        return (epoch_delta / NP_MS_DELTA)

    # Time
    elif isinstance(obj, dt.time):
        return (obj.hour * 3600 + obj.minute * 60 + obj.second) * 1000 + obj.microsecond / 1000.

def convert_datetime_array(array):
    ''' Convert NumPy datetime arrays to arrays of milliseconds since epoch.

    Arrays of ``datetime64`` or ``timedelta64`` values, and non-empty object
    arrays whose first element is a ``datetime.date``, are converted to
    floating point milliseconds. ``NaT`` entries are converted to ``nan``,
    consistent with :func:`convert_datetime_type`. Any other array is
    returned unchanged.

    Args:
        array : (obj)
            A NumPy array of datetime to convert

            If the value passed in is not a NumPy array, it will be returned as-is.

    Returns:
        array

    '''

    if not isinstance(array, np.ndarray):
        return array

    # Casting to microsecond units discards any finer (e.g. nanosecond)
    # precision before the values are scaled to float milliseconds.
    if array.dtype.kind == 'M':
        array = _to_float_ms(array.astype('datetime64[us]'))

    elif array.dtype.kind == 'm':
        array = _to_float_ms(array.astype('timedelta64[us]'))

    # XXX (bev) special case dates, not great
    elif array.dtype.kind == 'O' and len(array) > 0 and isinstance(array[0], dt.date):
        try:
            array = _to_float_ms(array.astype('datetime64[us]'))
        except Exception:
            # Deliberately best-effort: an object array that cannot be cast
            # as a whole is passed through untouched.
            pass

    return array

def make_id():
    ''' Return a new unique ID for a Bokeh object.

    Normally this function will return simple monotonically increasing integer
    IDs (as strings) for identifying Bokeh objects within a Document. However,
    if it is desirable to have globally unique IDs for every object, this
    behavior can be overridden by setting the environment variable
    ``BOKEH_SIMPLE_IDS=no``.

    Returns:
        str

    '''
    global _simple_id

    if settings.simple_ids():
        # the lock keeps the increment-and-read atomic across threads
        with _simple_id_lock:
            _simple_id += 1
            return str(_simple_id)
    else:
        return make_globally_unique_id()

def make_globally_unique_id():
    ''' Return a globally unique UUID.

    Some situations, e.g. id'ing dynamically created Divs in HTML documents,
    always require globally unique IDs.

    Returns:
        str

    '''
    return str(uuid.uuid4())

def array_encoding_disabled(array):
    ''' Determine whether binary encoding is disabled for an array.

    Returns True when the array's dtype is NOT one of the dtypes that can be
    binary encoded. The NumPy array dtypes that can be encoded are:

    {binary_array_types}

    Args:
        array (np.ndarray) : the array to check

    Returns:
        bool

    '''

    # disable binary encoding for non-supported dtypes
    return array.dtype not in BINARY_ARRAY_TYPES

array_encoding_disabled.__doc__ = format_docstring(array_encoding_disabled.__doc__,
                                                   binary_array_types="\n    ".join("* ``np." + str(x) + "``"
                                                                                    for x in sorted(BINARY_ARRAY_TYPES, key=str)))

def transform_array(array, force_list=False, buffers=None):
    ''' Transform a NumPy array into serialized format

    Converts un-serializable dtypes and returns JSON serializable
    format

    Args:
        array (np.ndarray) : a NumPy array to be transformed
        force_list (bool, optional) : whether to only output to standard lists
            This function can encode some dtypes using a binary encoding, but
            setting this argument to True will override that and cause only
            standard Python lists to be emitted. (default: False)

        buffers (list, optional) :
            If binary buffers are desired, the buffers parameter may be
            provided, and any columns that may be sent as binary buffers
            will be appended to the list. If None, then only base64 encoding
            will be used (default: None)

            If force_list is True, then this value will be ignored, and
            no buffers will be generated.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.


    Returns:
        JSON

    '''

    array = convert_datetime_array(array)

    return serialize_array(array, force_list=force_list, buffers=buffers)

def transform_array_to_list(array):
    ''' Transforms a NumPy array into a list of values

    For numeric arrays, non-finite values are replaced with the strings
    ``'NaN'``, ``'Infinity'`` and ``'-Infinity'``. For object arrays (when
    Pandas is available), null values are replaced with ``'NaN'``.

    Args:
        array (np.ndarray) : the NumPy array series to transform

    Returns:
        list

    '''
    if (array.dtype.kind in ('u', 'i', 'f') and (~np.isfinite(array)).any()):
        # switch to object dtype so string placeholders can be stored
        transformed = array.astype('object')
        transformed[np.isnan(array)] = 'NaN'
        transformed[np.isposinf(array)] = 'Infinity'
        transformed[np.isneginf(array)] = '-Infinity'
        return transformed.tolist()
    elif (array.dtype.kind == 'O' and pd and pd.isnull(array).any()):
        transformed = array.astype('object')
        transformed[pd.isnull(array)] = 'NaN'
        return transformed.tolist()
    return array.tolist()

def transform_series(series, force_list=False, buffers=None):
    ''' Transforms a Pandas series into serialized form

    A ``pd.PeriodIndex`` is first converted to timestamps; for anything else
    the underlying ``.values`` array is used directly.

    Args:
        series (pd.Series) : the Pandas series to transform
        force_list (bool, optional) : whether to only output to standard lists
            This function can encode some dtypes using a binary encoding, but
            setting this argument to True will override that and cause only
            standard Python lists to be emitted. (default: False)

        buffers (list, optional) :
            If binary buffers are desired, the buffers parameter may be
            provided, and any columns that may be sent as binary buffers
            will be appended to the list. If None, then only base64 encoding
            will be used (default: None)

            If force_list is True, then this value will be ignored, and
            no buffers will be generated.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        list or dict

    '''
    # not checking for pd here, this function should only be called if it
    # is already known that series is a Pandas Series type
    if isinstance(series, pd.PeriodIndex):
        vals = series.to_timestamp().values
    else:
        vals = series.values
    return transform_array(vals, force_list=force_list, buffers=buffers)

def serialize_array(array, force_list=False, buffers=None):
    ''' Transforms a NumPy array into serialized form.

    Args:
        array (np.ndarray) : the NumPy array to transform
        force_list (bool, optional) : whether to only output to standard lists
            This function can encode some dtypes using a binary encoding, but
            setting this argument to True will override that and cause only
            standard Python lists to be emitted. (default: False)

        buffers (list, optional) :
            If binary buffers are desired, the buffers parameter may be
            provided, and any columns that may be sent as binary buffers
            will be appended to the list. If None, then only base64 encoding
            will be used (default: None)

            If force_list is True, then this value will be ignored, and
            no buffers will be generated.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        list or dict

    '''
    if isinstance(array, np.ma.MaskedArray):
        array = array.filled(np.nan)  # Set masked values to nan
    if (array_encoding_disabled(array) or force_list):
        return transform_array_to_list(array)
    # the binary encoders read the array's raw bytes, so make them C-ordered
    if not array.flags['C_CONTIGUOUS']:
        array = np.ascontiguousarray(array)
    if buffers is None:
        return encode_base64_dict(array)
    else:
        return encode_binary_dict(array, buffers)

def traverse_data(obj, buffers=None):
    ''' Recursively traverse an object until a flat list is found.

    If every element of ``obj`` is a NumPy array, each one is passed to
    transform_array(). Otherwise a copy of the list is built in which float
    ``nan``, ``inf``, and ``-inf`` values are replaced by the strings
    ``'NaN'``, ``'Infinity'`` and ``'-Infinity'``, and nested lists or tuples
    are traversed recursively.

    Args:
        obj (list) : a list of values or lists

        buffers (list, optional) :
            Passed to transform_array() for a top-level sequence of arrays
            (default: None)

    Returns:
        list

    '''
    if all(isinstance(el, np.ndarray) for el in obj):
        return [transform_array(el, buffers=buffers) for el in obj]
    obj_copy = []
    for item in obj:
        # Check the base/common case first for performance reasons
        # Also use type(x) is float because it's faster than isinstance
        if type(item) is float:
            if isnan(item):
                item = 'NaN'
            elif isinf(item):
                if item > 0:
                    item = 'Infinity'
                else:
                    item = '-Infinity'
            obj_copy.append(item)
        elif isinstance(item, (list, tuple)):  # check less common type second
            # NOTE(review): ``buffers`` is not forwarded here, so arrays in
            # nested sequences are always base64 encoded -- confirm whether
            # that is intentional before changing it.
            obj_copy.append(traverse_data(item))
        else:
            obj_copy.append(item)
    return obj_copy

def transform_column_source_data(data, buffers=None, cols=None):
    ''' Transform ``ColumnSourceData`` data to a serialized format

    Args:
        data (dict) : the mapping of names to data columns to transform

        buffers (list, optional) :
            If binary buffers are desired, the buffers parameter may be
            provided, and any columns that may be sent as binary buffers
            will be appended to the list. If None, then only base64 encoding
            will be used (default: None)

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

        cols (list[str], optional) :
            Optional list of subset of columns to transform. If None, all
            columns will be transformed (default: None)

    Returns:
        JSON compatible dict

    '''
    to_transform = set(data) if cols is None else set(cols)

    data_copy = {}
    for key in to_transform:
        if pd and isinstance(data[key], (pd.Series, pd.Index)):
            data_copy[key] = transform_series(data[key], buffers=buffers)
        elif isinstance(data[key], np.ndarray):
            data_copy[key] = transform_array(data[key], buffers=buffers)
        else:
            data_copy[key] = traverse_data(data[key], buffers=buffers)

    return data_copy

def encode_binary_dict(array, buffers):
    ''' Send a numpy array as an unencoded binary buffer

    The encoded format is a dict with the following structure:

    .. code:: python

        {
            '__buffer__' :  << an ID to locate the buffer >>,
            'shape'      : << array shape >>,
            'dtype'      : << dtype name >>,
            'order'      : << byte order at origin (little or big)>>
        }

    Args:
        array (np.ndarray) : an array to encode

        buffers (list) :
            List to append buffers to. Each appended entry is a tuple of
            ``(dict(id=<buffer id>), <array bytes>)``.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        dict

    '''
    buffer_id = make_id()
    buf = (dict(id=buffer_id), array.tobytes())
    buffers.append(buf)

    return {
        '__buffer__' : buffer_id,
        'shape'      : array.shape,
        'dtype'      : array.dtype.name,
        'order'      : sys.byteorder,
    }

def encode_base64_dict(array):
    ''' Encode a NumPy array using base64:

    The encoded format is a dict with the following structure:

    .. code:: python

        {
            '__ndarray__' : << base64 encoded array data >>,
            'shape'       : << array shape >>,
            'dtype'       : << dtype name >>,
            'order'       : << byte order at origin (little or big)>>
        }

    Args:

        array (np.ndarray) : an array to encode

    Returns:
        dict

    '''
    return {
        '__ndarray__' : base64.b64encode(array.data).decode('utf-8'),
        'shape'       : array.shape,
        'dtype'       : array.dtype.name,
        'order'       : sys.byteorder,
    }

def decode_base64_dict(data):
    ''' Decode a base64 encoded array into a NumPy array.

    Only the ``'__ndarray__'``, ``'dtype'`` and ``'shape'`` entries are
    consulted; the ``'order'`` entry is not used.

    Args:
        data (dict) : encoded array data to decode

    Data should have the format encoded by :func:`encode_base64_dict`.

    Returns:
        np.ndarray

    '''
    b64 = base64.b64decode(data['__ndarray__'])
    # copy so the result owns its data instead of viewing the decoded bytes
    array = np.copy(np.frombuffer(b64, dtype=data['dtype']))
    if len(data['shape']) > 1:
        array = array.reshape(data['shape'])
    return array

#-----------------------------------------------------------------------------
# Dev API
#-----------------------------------------------------------------------------

#-----------------------------------------------------------------------------
# Private API
#-----------------------------------------------------------------------------

# Counter behind make_id(); the first simple ID handed out is "1000".
_simple_id = 999
_simple_id_lock = Lock()

# isinstance() needs a tuple, not a set
_dt_tuple = tuple(DATETIME_TYPES)

def _to_float_ms(array):
    ''' Scale a microsecond-unit datetime64/timedelta64 array to float
    milliseconds, mapping ``NaT`` entries to ``nan``.

    Args:
        array (np.ndarray) : an array with dtype ``datetime64[us]`` or
            ``timedelta64[us]``

    Returns:
        np.ndarray : float milliseconds

    '''
    nat = np.isnat(array)
    ms = array.astype('int64') / 1000.
    # Without this, NaT would surface as its int64 sentinel scaled to a huge
    # negative number. Skipped when there is nothing to replace so the
    # common path is unchanged.
    if nat.any():
        ms = np.where(nat, np.nan, ms)
    return ms

#-----------------------------------------------------------------------------
# Code
#-----------------------------------------------------------------------------