1#-----------------------------------------------------------------------------
2# Copyright (c) 2012 - 2021, Anaconda, Inc., and Bokeh Contributors.
3# All rights reserved.
4#
5# The full license is in the file LICENSE.txt, distributed with this software.
6#-----------------------------------------------------------------------------
7'''
8Functions for helping with serialization and deserialization of
9Bokeh objects.
10
11Certain NumPy array dtypes can be serialized to a binary format for
12performance and efficiency. The list of supported dtypes is:
13
14{binary_array_types}
15
16'''
17
18#-----------------------------------------------------------------------------
19# Boilerplate
20#-----------------------------------------------------------------------------
21import logging # isort:skip
22log = logging.getLogger(__name__)
23
24#-----------------------------------------------------------------------------
25# Imports
26#-----------------------------------------------------------------------------
27
28# Standard library imports
29import base64
30import datetime as dt
31import sys
32import uuid
33from math import isinf, isnan
34from threading import Lock
35
36# External imports
37import numpy as np
38
39# Bokeh imports
40from ..settings import settings
41from .dependencies import import_optional
42from .string import format_docstring
43
44#-----------------------------------------------------------------------------
45# Globals and constants
46#-----------------------------------------------------------------------------
47
# pandas is an optional dependency. Code in this module guards each use of
# ``pd`` with a truthiness check (presumably ``import_optional`` yields a falsy
# value such as None when pandas is not installed -- TODO confirm).
pd = import_optional('pandas')
49
# NumPy dtypes that may be sent with the binary (buffer or base64) encodings.
# Arrays of any other dtype are serialized as plain lists.
BINARY_ARRAY_TYPES = {
    np.dtype(scalar_type)
    for scalar_type in (
        np.float32,
        np.float64,
        np.uint8,
        np.int8,
        np.uint16,
        np.int16,
        np.uint32,
        np.int32,
    )
}
60
# Scalar types that ``is_datetime_type`` reports as date/time values (the set
# is frozen into ``_dt_tuple`` further down in this module).
DATETIME_TYPES = {
    dt.time,
    dt.datetime,
    np.datetime64,
}

# When pandas is available, also recognize its scalar date/time types.
if pd:
    try:
        _pd_timestamp = pd.Timestamp
    except AttributeError:
        # Fall back to the ``pd.tslib`` location when ``pd.Timestamp`` is
        # missing (presumably only on very old pandas releases -- TODO confirm)
        _pd_timestamp = pd.tslib.Timestamp
    DATETIME_TYPES.add(_pd_timestamp)
    DATETIME_TYPES.add(pd.Timedelta)
    DATETIME_TYPES.add(pd.Period)
    DATETIME_TYPES.add(type(pd.NaT))
76
# Epoch and unit constants used to express dates and times as milliseconds
# since the Unix epoch.
NP_EPOCH = np.datetime64(0, 'ms')
NP_MS_DELTA = np.timedelta64(1, 'ms')

# Naive (tzinfo-less) epoch: it is subtracted from datetimes whose tzinfo has
# been stripped. It is built directly instead of via
# ``datetime.utcfromtimestamp(0)``, which is deprecated as of Python 3.12; the
# resulting value is identical.
DT_EPOCH = dt.datetime(1970, 1, 1)
81
# Fill the ``binary_array_types`` placeholder of the module docstring. The
# dtype names are sorted so the rendered list does not depend on set iteration
# order, which is unspecified.
__doc__ = format_docstring(
    __doc__,
    binary_array_types="\n".join(
        "* ``np." + name + "``" for name in sorted(str(x) for x in BINARY_ARRAY_TYPES)
    ),
)
83
# Names exported by ``from <this module> import *``; every entry is a function
# defined in the General API section below.
__all__ = (
    'array_encoding_disabled',
    'convert_date_to_datetime',
    'convert_datetime_array',
    'convert_datetime_type',
    'convert_timedelta_type',
    'decode_base64_dict',
    'encode_binary_dict',
    'encode_base64_dict',
    'is_datetime_type',
    'is_timedelta_type',
    'make_globally_unique_id',
    'make_id',
    'serialize_array',
    'transform_array',
    'transform_array_to_list',
    'transform_column_source_data',
    'traverse_data',
    'transform_series',
)
104
105#-----------------------------------------------------------------------------
106# General API
107#-----------------------------------------------------------------------------
108
def is_datetime_type(obj):
    ''' Report whether ``obj`` is one of the date, time, or datetime types
    that Bokeh recognizes.

    The recognized types are the ones collected in ``DATETIME_TYPES``.

    Args:
        obj (object) : the value to check

    Returns:
        bool : True when ``obj`` is an instance of a recognized type

    '''
    recognized = isinstance(obj, _dt_tuple)
    return recognized
121
def is_timedelta_type(obj):
    ''' Report whether ``obj`` is a timedelta value that Bokeh recognizes.

    Both the standard library ``datetime.timedelta`` and NumPy's
    ``timedelta64`` count.

    Args:
        obj (object) : the value to check

    Returns:
        bool : True when ``obj`` is an instance of either timedelta type

    '''
    timedelta_types = (dt.timedelta, np.timedelta64)
    return isinstance(obj, timedelta_types)
133
def convert_date_to_datetime(obj):
    ''' Express a date as milliseconds since the epoch.

    Only the first six ``timetuple()`` fields (year through second) are used
    to rebuild a datetime, so any sub-second component is not included.

    Args:
        obj (date) : the object to convert

    Returns:
        float : milliseconds elapsed between ``DT_EPOCH`` and ``obj``

    '''
    year_through_second = obj.timetuple()[:6]
    elapsed = dt.datetime(*year_through_second) - DT_EPOCH
    return elapsed.total_seconds() * 1000
145
def convert_timedelta_type(obj):
    ''' Express a recognized timedelta value as floating point milliseconds.

    Args:
        obj (object) : a ``datetime.timedelta`` or ``np.timedelta64`` value

    Returns:
        float : the duration in milliseconds, or None when ``obj`` is not
        one of the recognized timedelta types

    '''
    # standard library timedelta: seconds -> milliseconds
    if isinstance(obj, dt.timedelta):
        return obj.total_seconds() * 1000.

    # NumPy timedelta64: count how many one-millisecond units fit
    if isinstance(obj, np.timedelta64):
        return obj / NP_MS_DELTA

    return None
161
def convert_datetime_type(obj):
    ''' Express a recognized date, time, or datetime value as floating point
    milliseconds since epoch.

    The checks run in a fixed order: pandas types first (only when pandas is
    available), then ``datetime.datetime``, ``datetime.date``,
    ``np.datetime64`` and finally ``datetime.time``.

    Args:
        obj (object) : the object to convert

    Returns:
        float : milliseconds (NaN for pandas ``NaT``), or None when ``obj``
        is not one of the handled types

    '''
    if pd:
        # Pandas NaT
        if obj is pd.NaT:
            return np.nan

        # Pandas Period
        if isinstance(obj, pd.Period):
            return obj.to_timestamp().value / 10**6.0

        # Pandas Timestamp
        if isinstance(obj, _pd_timestamp):
            return obj.value / 10**6.0

        # Pandas Timedelta
        if isinstance(obj, pd.Timedelta):
            return obj.value / 10**6.0

    # Datetime -- must be tested before date, since datetime subclasses date.
    # Any tzinfo is dropped (not converted) before subtracting the naive epoch.
    if isinstance(obj, dt.datetime):
        naive = obj.replace(tzinfo=None)
        return (naive - DT_EPOCH).total_seconds() * 1000

    # XXX (bev) ideally this would not be here "dates are not datetimes"
    # Date
    if isinstance(obj, dt.date):
        return convert_date_to_datetime(obj)

    # NumPy datetime64
    if isinstance(obj, np.datetime64):
        since_epoch = obj - NP_EPOCH
        return since_epoch / NP_MS_DELTA

    # Time: milliseconds since midnight
    if isinstance(obj, dt.time):
        whole_seconds = obj.hour * 3600 + obj.minute * 60 + obj.second
        return whole_seconds * 1000 + obj.microsecond / 1000.

    return None
205
def _datetime_like_to_ms(values):
    ''' Convert a microsecond-resolution datetime64/timedelta64 array to float
    milliseconds, mapping NaT entries to NaN.

    '''
    millis = values.astype('int64') / 1000.
    missing = np.isnat(values)
    # Casting NaT to int64 yields a huge negative sentinel rather than a
    # missing value; only touch the result when there is something to patch.
    if missing.any():
        millis = np.where(missing, np.nan, millis)
    return millis

def convert_datetime_array(array):
    ''' Convert NumPy datetime arrays to arrays of milliseconds since epoch.

    Datetime (``M``) and timedelta (``m``) arrays are converted, as are
    non-empty object arrays whose first element is a ``datetime.date`` (which
    includes ``datetime.datetime``). Missing values (NaT) become NaN.

    Args:
        array : (obj)
            A NumPy array of datetime to convert

            If the value passed in is not a NumPy array, it will be returned as-is.

    Returns:
        array : a float array of milliseconds, or the input unchanged when it
        is not a NumPy array, has another dtype, or cannot be converted

    '''

    if not isinstance(array, np.ndarray):
        return array

    kind = array.dtype.kind

    # not quite exact: values are first cast to microsecond resolution, so
    # anything finer than a microsecond is truncated
    if kind == 'M':
        array = _datetime_like_to_ms(array.astype('datetime64[us]'))

    elif kind == 'm':
        array = _datetime_like_to_ms(array.astype('timedelta64[us]'))

    # XXX (bev) special case dates, not great
    elif kind == 'O' and len(array) > 0 and isinstance(array[0], dt.date):
        try:
            array = _datetime_like_to_ms(array.astype('datetime64[us]'))
        except Exception:
            # deliberate best effort: an object array that cannot be cast
            # (e.g. mixed contents) is returned unchanged
            pass

    return array
238
def make_id():
    ''' Return a new unique ID for a Bokeh object.

    By default the IDs are simple, monotonically increasing integers
    (returned as strings), which is enough to identify Bokeh objects within
    a Document. When globally unique IDs are wanted for every object instead,
    set the environment variable ``BOKEH_SIMPLE_IDS=no``.

    Returns:
        str

    '''
    global _simple_id

    if not settings.simple_ids():
        return make_globally_unique_id()

    # increment and read under the lock so concurrent callers get distinct ids
    with _simple_id_lock:
        _simple_id += 1
        return str(_simple_id)
259
def make_globally_unique_id():
    ''' Return a freshly generated random UUID (version 4) as a string.

    Some situations, e.g. id'ing dynamically created Divs in HTML documents,
    always require globally unique IDs.

    Returns:
        str

    '''
    fresh = uuid.uuid4()
    return str(fresh)
271
def array_encoding_disabled(array):
    ''' Determine whether binary encoding is unavailable for an array.

    Binary encoding is only possible for these NumPy array dtypes:

    {binary_array_types}

    Args:
        array (np.ndarray) : the array to check

    Returns:
        bool : True when the dtype of ``array`` is NOT one of the supported
        dtypes (so the array can not be binary encoded), False otherwise

    '''
    supported = array.dtype in BINARY_ARRAY_TYPES
    return not supported
289
# Fill the ``binary_array_types`` placeholder in the docstring above. The dtype
# names are sorted so the rendered list does not depend on set iteration
# order, which is unspecified.
array_encoding_disabled.__doc__ = format_docstring(
    array_encoding_disabled.__doc__,
    binary_array_types="\n    ".join(
        "* ``np." + name + "``" for name in sorted(str(x) for x in BINARY_ARRAY_TYPES)
    ),
)
293
def transform_array(array, force_list=False, buffers=None):
    ''' Transform a NumPy array into a serialized format.

    Datetime-like arrays are first converted to milliseconds with
    ``convert_datetime_array``; the result is then handed to
    ``serialize_array``.

    Args:
        array (np.ndarray) : a NumPy array to be transformed
        force_list (bool, optional) : whether to only output to standard lists
            Some dtypes can be sent using a binary encoding, but setting
            this argument to True overrides that and causes only standard
            Python lists to be emitted. (default: False)

        buffers (list, optional) :
            If binary buffers are desired, a list may be provided, and any
            array that can be sent as a binary buffer is appended to it
            (``encode_binary_dict`` calls ``buffers.append``). If None, then
            only base64 encoding will be used (default: None)

            If force_list is True, then this value will be ignored, and
            no buffers will be generated.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        list or dict

    '''
    converted = convert_datetime_array(array)
    return serialize_array(converted, force_list=force_list, buffers=buffers)
328
def transform_array_to_list(array):
    ''' Turn a NumPy array into a (possibly nested) plain Python list.

    Values that are not valid JSON numbers are replaced by strings: in
    numeric arrays NaN, +inf and -inf become ``'NaN'``, ``'Infinity'`` and
    ``'-Infinity'``; in object arrays (when pandas is available) anything
    ``pd.isnull`` reports as null becomes ``'NaN'``.

    Args:
        array (np.ndarray) : the NumPy array to transform

    Returns:
        list

    '''
    kind = array.dtype.kind

    if kind in ('u', 'i', 'f') and (~np.isfinite(array)).any():
        result = array.astype('object')
        # masks are computed on the original array, so the substitutions
        # cannot interfere with one another
        substitutions = (
            (np.isnan(array), 'NaN'),
            (np.isposinf(array), 'Infinity'),
            (np.isneginf(array), '-Infinity'),
        )
        for mask, token in substitutions:
            result[mask] = token
        return result.tolist()

    if kind == 'O' and pd and pd.isnull(array).any():
        result = array.astype('object')
        result[pd.isnull(array)] = 'NaN'
        return result.tolist()

    return array.tolist()
350
def transform_series(series, force_list=False, buffers=None):
    ''' Transform a Pandas series (or index) into serialized form.

    The underlying values are extracted (a ``PeriodIndex`` is converted to
    timestamps first) and passed on to ``transform_array``.

    Args:
        series (pd.Series) : the Pandas series to transform
        force_list (bool, optional) : whether to only output to standard lists
            Some dtypes can be sent using a binary encoding, but setting
            this argument to True overrides that and causes only standard
            Python lists to be emitted. (default: False)

        buffers (list, optional) :
            If binary buffers are desired, a list may be provided, and any
            column that can be sent as a binary buffer is appended to it
            (``encode_binary_dict`` calls ``buffers.append``). If None, then
            only base64 encoding will be used (default: None)

            If force_list is True, then this value will be ignored, and
            no buffers will be generated.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        list or dict

    '''
    # pandas availability is deliberately not checked: callers are expected to
    # invoke this only when ``series`` is already known to be a pandas object
    is_period_index = isinstance(series, pd.PeriodIndex)
    vals = series.to_timestamp().values if is_period_index else series.values
    return transform_array(vals, force_list=force_list, buffers=buffers)
384
def serialize_array(array, force_list=False, buffers=None):
    ''' Transform a NumPy array into serialized form.

    Arrays whose dtype can not be binary encoded, or any array when
    ``force_list`` is set, come back as plain lists. Everything else is
    made C-contiguous if necessary and then encoded either as a binary
    buffer (``buffers`` given) or as base64 (``buffers`` is None).

    Args:
        array (np.ndarray) : the NumPy array to transform
        force_list (bool, optional) : whether to only output to standard lists
            Some dtypes can be sent using a binary encoding, but setting
            this argument to True overrides that and causes only standard
            Python lists to be emitted. (default: False)

        buffers (list, optional) :
            If binary buffers are desired, a list may be provided, and any
            array that can be sent as a binary buffer is appended to it
            (``encode_binary_dict`` calls ``buffers.append``). If None, then
            only base64 encoding will be used (default: None)

            If force_list is True, then this value will be ignored, and
            no buffers will be generated.

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        list or dict

    '''
    if isinstance(array, np.ma.MaskedArray):
        # masked entries are replaced with nan
        array = array.filled(np.nan)

    list_only = array_encoding_disabled(array) or force_list
    if list_only:
        return transform_array_to_list(array)

    contiguous = array if array.flags['C_CONTIGUOUS'] else np.ascontiguousarray(array)

    if buffers is not None:
        return encode_binary_dict(contiguous, buffers)
    return encode_base64_dict(contiguous)
421
def traverse_data(obj, buffers=None):
    ''' Recursively traverse a (possibly nested) sequence and make it JSON
    friendly.

    A sequence made up entirely of NumPy arrays has every array passed to
    ``transform_array``. Otherwise a new list is built in which Python float
    ``nan``, ``inf`` and ``-inf`` values are replaced by the strings
    ``'NaN'``, ``'Infinity'`` and ``'-Infinity'``, nested lists and tuples are
    traversed the same way, and all other items are copied through as-is.

    Args:
        obj (list) : a list of values or lists

        buffers (list, optional) :
            Forwarded to ``transform_array`` for any arrays that are found,
            at every nesting level. If None, arrays that can be binary
            encoded use base64 (default: None)

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        list

    '''
    if all(isinstance(el, np.ndarray) for el in obj):
        return [transform_array(el, buffers=buffers) for el in obj]

    obj_copy = []
    for item in obj:
        # Check the base/common case first for performance reasons
        # Also use type(x) is float because it's faster than isinstance
        if type(item) is float:
            if isnan(item):
                item = 'NaN'
            elif isinf(item):
                item = 'Infinity' if item > 0 else '-Infinity'
        elif isinstance(item, (list, tuple)):  # check less common type second
            # pass ``buffers`` down so nested arrays are encoded the same way
            # as top-level ones instead of silently falling back to base64
            item = traverse_data(item, buffers=buffers)
        obj_copy.append(item)
    return obj_copy
452
def transform_column_source_data(data, buffers=None, cols=None):
    ''' Transform a mapping of column names to column data into a serialized
    format.

    Pandas series and indexes go through ``transform_series``, NumPy arrays
    through ``transform_array``, and anything else through ``traverse_data``.

    Args:
        data (dict) : the mapping of names to data columns to transform

        buffers (list, optional) :
            If binary buffers are desired, a list may be provided, and any
            column that can be sent as a binary buffer is appended to it
            (``encode_binary_dict`` calls ``buffers.append``). If None, then
            only base64 encoding will be used (default: None)

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

        cols (list[str], optional) :
            Optional list of subset of columns to transform. If None, all
            columns will be transformed (default: None)

    Returns:
        JSON compatible dict

    '''
    selected = set(data) if cols is None else set(cols)

    data_copy = {}
    for key in selected:
        column = data[key]
        if pd and isinstance(column, (pd.Series, pd.Index)):
            transformed = transform_series(column, buffers=buffers)
        elif isinstance(column, np.ndarray):
            transformed = transform_array(column, buffers=buffers)
        else:
            transformed = traverse_data(column, buffers=buffers)
        data_copy[key] = transformed

    return data_copy
488
def encode_binary_dict(array, buffers):
    ''' Send a numpy array as an unencoded binary buffer

    The array bytes are appended to ``buffers`` as a ``(header, bytes)``
    pair whose header is ``dict(id=<buffer id>)``, and a reference dict with
    the following structure is returned:

    .. code:: python

        {
            '__buffer__' :  << an ID to locate the buffer >>,
            'shape'      : << array shape >>,
            'dtype'      : << dtype name >>,
            'order'      : << byte order at origin (little or big)>>
        }

    Args:
        array (np.ndarray) : an array to encode

        buffers (list) :
            List to append the new buffer to

            **This is an "out" parameter**. The values it contains will be
            modified in-place.

    Returns:
        dict

    '''
    buffer_id = make_id()
    header = dict(id=buffer_id)
    payload = array.tobytes()
    buffers.append((header, payload))

    reference = {
        '__buffer__'  : buffer_id,
        'shape'       : array.shape,
        'dtype'       : array.dtype.name,
        'order'       : sys.byteorder,
    }
    return reference
526
def encode_base64_dict(array):
    ''' Encode a NumPy array using base64:

    The encoded format is a dict with the following structure:

    .. code:: python

        {
            '__ndarray__' : << base64 encoded array data >>,
            'shape'       : << array shape >>,
            'dtype'       : << dtype name >>,
            'order'       : << byte order at origin (little or big)>>
        }

    Args:

        array (np.ndarray) : an array to encode

    Returns:
        dict

    '''
    # base64-encode the array's raw memory and carry it as text
    encoded_bytes = base64.b64encode(array.data)
    encoded_text = encoded_bytes.decode('utf-8')

    result = {
        '__ndarray__' : encoded_text,
        'shape'       : array.shape,
        'dtype'       : array.dtype.name,
        'order'       : sys.byteorder,
    }
    return result
554
def decode_base64_dict(data):
    ''' Decode a base64 encoded array into a NumPy array.

    Only the ``'__ndarray__'``, ``'dtype'`` and ``'shape'`` entries are
    read. The result is a copy, so it does not share memory with the decoded
    bytes; it is reshaped only when the shape has more than one dimension.

    Args:
        data (dict) : encoded array data to decode

    Data should have the format encoded by :func:`encode_base64_dict`.

    Returns:
        np.ndarray

    '''
    raw = base64.b64decode(data['__ndarray__'])
    flat_view = np.frombuffer(raw, dtype=data['dtype'])
    array = np.copy(flat_view)

    shape = data['shape']
    if len(shape) > 1:
        return array.reshape(shape)
    return array
572
573#-----------------------------------------------------------------------------
574# Dev API
575#-----------------------------------------------------------------------------
576
577#-----------------------------------------------------------------------------
578# Private API
579#-----------------------------------------------------------------------------
580
# Counter behind the simple integer ids handed out by ``make_id``. It is
# incremented before being returned, so the first simple id issued is "1000".
_simple_id = 999
# Held by ``make_id`` while it increments and reads ``_simple_id``.
_simple_id_lock = Lock()

# ``isinstance`` needs a tuple of types rather than a set; built once here,
# after any pandas types have been added to ``DATETIME_TYPES``.
_dt_tuple = tuple(DATETIME_TYPES)
585
586#-----------------------------------------------------------------------------
587# Code
588#-----------------------------------------------------------------------------
589