1# cython: language_level=3
2"""Class to efficiently select and read data from an HDF5 dataset
3
4This is written in Cython to reduce overhead when reading small amounts of
5data. The core of it is translating between numpy-style slicing & indexing and
6HDF5's H5Sselect_hyperslab calls.
7
8Python & numpy distinguish indexing a[3] from slicing a single element a[3:4],
9but there is no equivalent to this when selecting data in HDF5. So we store a
10separate boolean ('scalar') for each dimension to distinguish these cases.
11"""
12from numpy cimport (
13    ndarray, npy_intp, PyArray_SimpleNew, PyArray_DATA, import_array,
14    PyArray_IsNativeByteOrder,
15)
16from cpython cimport PyNumber_Index
17
18import numpy as np
19from .defs cimport *
20from .h5d cimport DatasetID
21from .h5s cimport SpaceID
22from .h5t cimport TypeID, typewrap, py_create
23from .utils cimport emalloc, efree, convert_dims
24
25import_array()
26
27
28cdef object convert_bools(bint* data, hsize_t rank):
29    # Convert a bint array to a Python tuple of bools.
30    cdef list bools_l
31    cdef int i
32    bools_l = []
33
34    for i in range(rank):
35        bools_l.append(bool(data[i]))
36
37    return tuple(bools_l)
38
39
40cdef class Selector:
41    cdef SpaceID spaceobj
42    cdef hid_t space
43    cdef int rank
44    cdef bint is_fancy
45    cdef hsize_t* dims
46    cdef hsize_t* start
47    cdef hsize_t* stride
48    cdef hsize_t* count
49    cdef hsize_t* block
50    cdef bint* scalar
51
52    def __cinit__(self, SpaceID space):
53        self.spaceobj = space
54        self.space = space.id
55        self.rank = H5Sget_simple_extent_ndims(self.space)
56        self.is_fancy = False
57
58        self.dims = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
59        self.start = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
60        self.stride = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
61        self.count = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
62        self.block = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
63        self.scalar = <bint*>emalloc(sizeof(bint) * self.rank)
64
65        H5Sget_simple_extent_dims(self.space, self.dims, NULL)
66
67    def __dealloc__(self):
68        efree(self.dims)
69        efree(self.start)
70        efree(self.stride)
71        efree(self.count)
72        efree(self.block)
73        efree(self.scalar)
74
75    cdef bint apply_args(self, tuple args) except 0:
76        """Apply indexing arguments to this Selector object"""
77        cdef:
78            int nargs, ellipsis_ix, array_ix = -1
79            bint seen_ellipsis = False
80            int dim_ix = -1
81            hsize_t l
82            ndarray array_arg
83
84        # If no explicit ellipsis, implicit ellipsis is after args
85        nargs = ellipsis_ix = len(args)
86
87        for a in args:
88            dim_ix += 1
89
90            if a is Ellipsis:
91                # [...] - Ellipsis (fill any unspecified dimensions here)
92                if seen_ellipsis:
93                    raise ValueError("Only one ellipsis may be used.")
94                seen_ellipsis = True
95
96                ellipsis_ix = dim_ix
97                nargs -= 1  # Don't count the ... itself
98                if nargs > self.rank:
99                    raise ValueError(f"{nargs} indexing arguments for {self.rank} dimensions")
100
101                # Skip ahead to the remaining dimensions
102                # -1 because the next iteration will increment dim_ix
103                dim_ix += self.rank - nargs - 1
104                continue
105
106            if dim_ix >= self.rank:
107                raise ValueError(f"{nargs} indexing arguments for {self.rank} dimensions")
108
109            # Length of the relevant dimension
110            l = self.dims[dim_ix]
111
112            # [0:10] - slicing
113            if isinstance(a, slice):
114                start, stop, step = a.indices(l)
115                # Now if step > 0, then start and stop are in [0, length];
116                # if step < 0, they are in [-1, length - 1] (Python 2.6b2 and later;
117                # Python issue 3004).
118
119                if step < 1:
120                    raise ValueError("Step must be >= 1 (got %d)" % step)
121                if stop < start:
122                    # list/tuple and numpy consider stop < start to be an empty selection
123                    start, count, step = 0, 0, 1
124                else:
125                    count = 1 + (stop - start - 1) // step
126
127                self.start[dim_ix] = start
128                self.stride[dim_ix] = step
129                self.count[dim_ix] = count
130                self.block[dim_ix] = 1
131                self.scalar[dim_ix] = False
132
133                continue
134
135            # [0] - simple integer indices
136            try:
137                # PyIndex_Check only checks the type - e.g. all numpy arrays
138                # pass PyIndex_Check, but only scalar arrays are valid.
139                a = PyNumber_Index(a)
140            except TypeError:
141                pass  # Fall through to check for list/array
142            else:
143                if a < 0:
144                    a += l
145
146                if not 0 <= a < l:
147                    if l == 0:
148                        msg = f"Index ({a}) out of range for empty dimension"
149                    else:
150                        msg = f"Index ({a}) out of range for (0-{l-1})"
151                    raise IndexError(msg)
152
153                self.start[dim_ix] = a
154                self.stride[dim_ix] = 1
155                self.count[dim_ix] = 1
156                self.block[dim_ix] = 1
157                self.scalar[dim_ix] = True
158
159                continue
160
161            # MultiBlockSlice exposes h5py's separate count & block parameters
162            # to allow more complex repeating selections.
163            if isinstance(a, MultiBlockSlice):
164                (
165                    self.start[dim_ix],
166                    self.stride[dim_ix],
167                    self.count[dim_ix],
168                    self.block[dim_ix],
169                ) = a.indices(l)
170                self.scalar[dim_ix] = False
171
172                continue
173
174            # [[0, 2, 10]] - list/array of indices ('fancy indexing')
175            if isinstance(a, (list, np.ndarray)):
176                if isinstance(a, list) and len(a) == 0:
177                    a = np.asarray(a, dtype=np.intp)
178                else:
179                    a = np.asarray(a)
180                if a.ndim != 1:
181                    raise TypeError("Only 1D arrays allowed for fancy indexing")
182                if not np.issubdtype(a.dtype, np.integer):
183                    raise TypeError("Indexing arrays must have integer dtypes")
184                if array_ix != -1:
185                    raise TypeError("Only one indexing vector or array is currently allowed for fancy indexing")
186
187                # Convert negative indices to positive
188                if np.any(a < 0):
189                    a = a.copy()
190                    a[a < 0] += l
191
192                # Bounds check
193                if np.any((a < 0) | (a > l)):
194                    if l == 0:
195                        msg = "Fancy indexing out of range for empty dimension"
196                    else:
197                        msg = f"Fancy indexing out of range for (0-{l-1})"
198                    raise IndexError(msg)
199
200                if np.any(np.diff(a) <= 0):
201                    raise TypeError("Indexing elements must be in increasing order")
202
203                array_ix = dim_ix
204                array_arg = a
205                self.start[dim_ix] = 0
206                self.stride[dim_ix] = 1
207                self.count[dim_ix] = a.shape[0]
208                self.block[dim_ix] = 1
209                self.scalar[dim_ix] = False
210
211                continue
212
213            raise TypeError("Simple selection can't process %r" % a)
214
215        if nargs < self.rank:
216            # Fill in ellipsis or trailing dimensions
217            ellipsis_end = ellipsis_ix + (self.rank - nargs)
218            for dim_ix in range(ellipsis_ix, ellipsis_end):
219                self.start[dim_ix] = 0
220                self.stride[dim_ix] = 1
221                self.count[dim_ix] = self.dims[dim_ix]
222                self.block[dim_ix] = 1
223                self.scalar[dim_ix] = False
224
225        if nargs == 0:
226            H5Sselect_all(self.space)
227            self.is_fancy = False
228        elif array_ix != -1:
229            self.select_fancy(array_ix, array_arg)
230            self.is_fancy = True
231        else:
232            H5Sselect_hyperslab(self.space, H5S_SELECT_SET, self.start, self.stride, self.count, self.block)
233            self.is_fancy = False
234        return True
235
236    cdef select_fancy(self, int array_ix, ndarray array_arg):
237        """Apply a 'fancy' selection (array of indices) to the dataspace"""
238        cdef hsize_t* tmp_start
239        cdef hsize_t* tmp_count
240        cdef uint64_t i
241
242        H5Sselect_none(self.space)
243
244        tmp_start = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
245        tmp_count = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank)
246        try:
247            memcpy(tmp_start, self.start, sizeof(hsize_t) * self.rank)
248            memcpy(tmp_count, self.count, sizeof(hsize_t) * self.rank)
249            tmp_count[array_ix] = 1
250
251            # Iterate over the array of indices, add each hyperslab to the selection
252            for i in array_arg:
253                tmp_start[array_ix] = i
254                H5Sselect_hyperslab(self.space, H5S_SELECT_OR, tmp_start, self.stride, tmp_count, self.block)
255        finally:
256            efree(tmp_start)
257            efree(tmp_count)
258
259
260    def make_selection(self, tuple args):
261        """Apply indexing/slicing args and create a high-level selection object
262
263        Returns an instance of SimpleSelection or FancySelection, with a copy
264        of the selector's dataspace.
265        """
266        cdef:
267            SpaceID space
268            tuple shape, start, count, step, scalar, arr_shape
269            int arr_rank, i
270            npy_intp* arr_shape_p
271
272        self.apply_args(args)
273        space = SpaceID(H5Scopy(self.space))
274
275        shape = convert_dims(self.dims, self.rank)
276        count = convert_dims(self.count, self.rank)
277        block = convert_dims(self.block, self.rank)
278        mshape = tuple(c * b for c, b in zip(count, block))
279
280        from ._hl.selections import SimpleSelection, FancySelection
281
282        if self.is_fancy:
283            arr_shape = tuple(
284                mshape[i] for i in range(self.rank) if not self.scalar[i]
285            )
286            return FancySelection(shape, space, count, arr_shape)
287        else:
288            start = convert_dims(self.start, self.rank)
289            step = convert_dims(self.stride, self.rank)
290            scalar = convert_bools(self.scalar, self.rank)
291            return SimpleSelection(shape, space, (start, mshape, step, scalar))
292
293
294cdef class Reader:
295    cdef hid_t dataset
296    cdef Selector selector
297    cdef TypeID h5_memory_datatype
298    cdef int np_typenum
299    cdef bint native_byteorder
300
301    def __cinit__(self, DatasetID dsid):
302        self.dataset = dsid.id
303        self.selector = Selector(dsid.get_space())
304
305        # HDF5 can use e.g. custom float datatypes which don't have an exact
306        # match in numpy. Translating it to a numpy dtype chooses the smallest
307        # dtype which won't lose any data, then we translate that back to a
308        # HDF5 datatype (h5_memory_datatype).
309        h5_stored_datatype = typewrap(H5Dget_type(self.dataset))
310        np_dtype = h5_stored_datatype.py_dtype()
311        self.np_typenum = np_dtype.num
312        self.native_byteorder = PyArray_IsNativeByteOrder(ord(np_dtype.byteorder))
313        self.h5_memory_datatype = py_create(np_dtype)
314
315    cdef ndarray make_array(self, hsize_t* mshape):
316        """Create an array to read the selected data into.
317
318        .apply_args() should be called first, to set self.count and self.scalar.
319        Only works for simple numeric dtypes which can be defined with typenum.
320        """
321        cdef int i, arr_rank = 0
322        cdef npy_intp* arr_shape
323
324        arr_shape = <npy_intp*>emalloc(sizeof(npy_intp) * self.selector.rank)
325        try:
326            # Copy any non-scalar selection dimensions for the array shape
327            for i in range(self.selector.rank):
328                if not self.selector.scalar[i]:
329                    arr_shape[arr_rank] = mshape[i]
330                    arr_rank += 1
331
332            arr = PyArray_SimpleNew(arr_rank, arr_shape, self.np_typenum)
333            if not self.native_byteorder:
334                arr = arr.newbyteorder()
335        finally:
336            efree(arr_shape)
337
338        return arr
339
340    def read(self, tuple args):
341        """Index the dataset using args and read into a new numpy array
342
343        Only works for simple numeric dtypes.
344        """
345        cdef void* buf
346        cdef ndarray arr
347        cdef hsize_t* mshape
348        cdef hid_t mspace
349        cdef int i
350
351        self.selector.apply_args(args)
352
353        # The selected length of each dimension is count * block
354        mshape = <hsize_t*>emalloc(sizeof(hsize_t) * self.selector.rank)
355        try:
356            for i in range(self.selector.rank):
357                mshape[i] = self.selector.count[i] * self.selector.block[i]
358            arr = self.make_array(mshape)
359            buf = PyArray_DATA(arr)
360
361            mspace = H5Screate_simple(self.selector.rank, mshape, NULL)
362        finally:
363            efree(mshape)
364
365        try:
366            H5Dread(self.dataset, self.h5_memory_datatype.id, mspace,
367                    self.selector.space, H5P_DEFAULT, buf)
368        finally:
369            H5Sclose(mspace)
370
371        if arr.ndim == 0:
372            return arr[()]
373        else:
374            return arr
375
376
377class MultiBlockSlice(object):
378    """
379        A conceptual extension of the built-in slice object to allow selections
380        using start, stride, count and block.
381
382        If given, these parameters will be passed directly to
383        H5Sselect_hyperslab. The defaults are start=0, stride=1, block=1,
384        count=length, which will select the full extent.
385
386        __init__(start, stride, count, block) => Create a new MultiBlockSlice, storing
387            any given selection parameters and using defaults for the others
388        start => The offset of the starting element of the specified hyperslab
389        stride => The number of elements between the start of one block and the next
390        count => The number of blocks to select
391        block => The number of elements in each block
392
393    """
394
395    def __init__(self, start=0, stride=1, count=None, block=1):
396        if start < 0:
397            raise ValueError("Start can't be negative")
398        if stride < 1 or (count is not None and count < 1) or block < 1:
399            raise ValueError("Stride, count and block can't be 0 or negative")
400        if block > stride:
401            raise ValueError("Blocks will overlap if block > stride")
402
403        self.start = start
404        self.stride = stride
405        self.count = count
406        self.block = block
407
408    def indices(self, length):
409        """Calculate and validate start, stride, count and block for the given length"""
410        if self.count is None:
411            # Select as many full blocks as possible without exceeding extent
412            count = (length - self.start - self.block) // self.stride + 1
413            if count < 1:
414                raise ValueError(
415                    "No full blocks can be selected using {} "
416                    "on dimension of length {}".format(self._repr(), length)
417                )
418        else:
419            count = self.count
420
421        end_index = self.start + self.block + (count - 1) * self.stride - 1
422        if end_index >= length:
423            raise ValueError(
424                "{} range ({} - {}) extends beyond maximum index ({})".format(
425                    self._repr(count), self.start, end_index, length - 1
426                ))
427
428        return self.start, self.stride, count, self.block
429
430    def _repr(self, count=None):
431        if count is None:
432            count = self.count
433        return "{}(start={}, stride={}, count={}, block={})".format(
434            self.__class__.__name__, self.start, self.stride, count, self.block
435        )
436
437    def __repr__(self):
438        return self._repr(count=None)
439