1# cython: language_level=3 2"""Class to efficiently select and read data from an HDF5 dataset 3 4This is written in Cython to reduce overhead when reading small amounts of 5data. The core of it is translating between numpy-style slicing & indexing and 6HDF5's H5Sselect_hyperslab calls. 7 8Python & numpy distinguish indexing a[3] from slicing a single element a[3:4], 9but there is no equivalent to this when selecting data in HDF5. So we store a 10separate boolean ('scalar') for each dimension to distinguish these cases. 11""" 12from numpy cimport ( 13 ndarray, npy_intp, PyArray_SimpleNew, PyArray_DATA, import_array, 14 PyArray_IsNativeByteOrder, 15) 16from cpython cimport PyNumber_Index 17 18import numpy as np 19from .defs cimport * 20from .h5d cimport DatasetID 21from .h5s cimport SpaceID 22from .h5t cimport TypeID, typewrap, py_create 23from .utils cimport emalloc, efree, convert_dims 24 25import_array() 26 27 28cdef object convert_bools(bint* data, hsize_t rank): 29 # Convert a bint array to a Python tuple of bools. 30 cdef list bools_l 31 cdef int i 32 bools_l = [] 33 34 for i in range(rank): 35 bools_l.append(bool(data[i])) 36 37 return tuple(bools_l) 38 39 40cdef class Selector: 41 cdef SpaceID spaceobj 42 cdef hid_t space 43 cdef int rank 44 cdef bint is_fancy 45 cdef hsize_t* dims 46 cdef hsize_t* start 47 cdef hsize_t* stride 48 cdef hsize_t* count 49 cdef hsize_t* block 50 cdef bint* scalar 51 52 def __cinit__(self, SpaceID space): 53 self.spaceobj = space 54 self.space = space.id 55 self.rank = H5Sget_simple_extent_ndims(self.space) 56 self.is_fancy = False 57 58 self.dims = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 59 self.start = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 60 self.stride = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 61 self.count = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 62 self.block = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 63 self.scalar = <bint*>emalloc(sizeof(bint) * self.rank) 64 65 H5Sget_simple_extent_dims(self.space, self.dims, NULL) 66 67 def __dealloc__(self): 68 efree(self.dims) 69 efree(self.start) 70 efree(self.stride) 71 efree(self.count) 72 efree(self.block) 73 efree(self.scalar) 74 75 cdef bint apply_args(self, tuple args) except 0: 76 """Apply indexing arguments to this Selector object""" 77 cdef: 78 int nargs, ellipsis_ix, array_ix = -1 79 bint seen_ellipsis = False 80 int dim_ix = -1 81 hsize_t l 82 ndarray array_arg 83 84 # If no explicit ellipsis, implicit ellipsis is after args 85 nargs = ellipsis_ix = len(args) 86 87 for a in args: 88 dim_ix += 1 89 90 if a is Ellipsis: 91 # [...] - Ellipsis (fill any unspecified dimensions here) 92 if seen_ellipsis: 93 raise ValueError("Only one ellipsis may be used.") 94 seen_ellipsis = True 95 96 ellipsis_ix = dim_ix 97 nargs -= 1 # Don't count the ... itself 98 if nargs > self.rank: 99 raise ValueError(f"{nargs} indexing arguments for {self.rank} dimensions") 100 101 # Skip ahead to the remaining dimensions 102 # -1 because the next iteration will increment dim_ix 103 dim_ix += self.rank - nargs - 1 104 continue 105 106 if dim_ix >= self.rank: 107 raise ValueError(f"{nargs} indexing arguments for {self.rank} dimensions") 108 109 # Length of the relevant dimension 110 l = self.dims[dim_ix] 111 112 # [0:10] - slicing 113 if isinstance(a, slice): 114 start, stop, step = a.indices(l) 115 # Now if step > 0, then start and stop are in [0, length]; 116 # if step < 0, they are in [-1, length - 1] (Python 2.6b2 and later; 117 # Python issue 3004). 118 119 if step < 1: 120 raise ValueError("Step must be >= 1 (got %d)" % step) 121 if stop < start: 122 # list/tuple and numpy consider stop < start to be an empty selection 123 start, count, step = 0, 0, 1 124 else: 125 count = 1 + (stop - start - 1) // step 126 127 self.start[dim_ix] = start 128 self.stride[dim_ix] = step 129 self.count[dim_ix] = count 130 self.block[dim_ix] = 1 131 self.scalar[dim_ix] = False 132 133 continue 134 135 # [0] - simple integer indices 136 try: 137 # PyIndex_Check only checks the type - e.g. all numpy arrays 138 # pass PyIndex_Check, but only scalar arrays are valid. 139 a = PyNumber_Index(a) 140 except TypeError: 141 pass # Fall through to check for list/array 142 else: 143 if a < 0: 144 a += l 145 146 if not 0 <= a < l: 147 if l == 0: 148 msg = f"Index ({a}) out of range for empty dimension" 149 else: 150 msg = f"Index ({a}) out of range for (0-{l-1})" 151 raise IndexError(msg) 152 153 self.start[dim_ix] = a 154 self.stride[dim_ix] = 1 155 self.count[dim_ix] = 1 156 self.block[dim_ix] = 1 157 self.scalar[dim_ix] = True 158 159 continue 160 161 # MultiBlockSlice exposes h5py's separate count & block parameters 162 # to allow more complex repeating selections. 163 if isinstance(a, MultiBlockSlice): 164 ( 165 self.start[dim_ix], 166 self.stride[dim_ix], 167 self.count[dim_ix], 168 self.block[dim_ix], 169 ) = a.indices(l) 170 self.scalar[dim_ix] = False 171 172 continue 173 174 # [[0, 2, 10]] - list/array of indices ('fancy indexing') 175 if isinstance(a, (list, np.ndarray)): 176 if isinstance(a, list) and len(a) == 0: 177 a = np.asarray(a, dtype=np.intp) 178 else: 179 a = np.asarray(a) 180 if a.ndim != 1: 181 raise TypeError("Only 1D arrays allowed for fancy indexing") 182 if not np.issubdtype(a.dtype, np.integer): 183 raise TypeError("Indexing arrays must have integer dtypes") 184 if array_ix != -1: 185 raise TypeError("Only one indexing vector or array is currently allowed for fancy indexing") 186 187 # Convert negative indices to positive 188 if np.any(a < 0): 189 a = a.copy() 190 a[a < 0] += l 191 192 # Bounds check 193 if np.any((a < 0) | (a > l)): 194 if l == 0: 195 msg = "Fancy indexing out of range for empty dimension" 196 else: 197 msg = f"Fancy indexing out of range for (0-{l-1})" 198 raise IndexError(msg) 199 200 if np.any(np.diff(a) <= 0): 201 raise TypeError("Indexing elements must be in increasing order") 202 203 array_ix = dim_ix 204 array_arg = a 205 self.start[dim_ix] = 0 206 self.stride[dim_ix] = 1 207 self.count[dim_ix] = a.shape[0] 208 self.block[dim_ix] = 1 209 self.scalar[dim_ix] = False 210 211 continue 212 213 raise TypeError("Simple selection can't process %r" % a) 214 215 if nargs < self.rank: 216 # Fill in ellipsis or trailing dimensions 217 ellipsis_end = ellipsis_ix + (self.rank - nargs) 218 for dim_ix in range(ellipsis_ix, ellipsis_end): 219 self.start[dim_ix] = 0 220 self.stride[dim_ix] = 1 221 self.count[dim_ix] = self.dims[dim_ix] 222 self.block[dim_ix] = 1 223 self.scalar[dim_ix] = False 224 225 if nargs == 0: 226 H5Sselect_all(self.space) 227 self.is_fancy = False 228 elif array_ix != -1: 229 self.select_fancy(array_ix, array_arg) 230 self.is_fancy = True 231 else: 232 H5Sselect_hyperslab(self.space, H5S_SELECT_SET, self.start, self.stride, self.count, self.block) 233 self.is_fancy = False 234 return True 235 236 cdef select_fancy(self, int array_ix, ndarray array_arg): 237 """Apply a 'fancy' selection (array of indices) to the dataspace""" 238 cdef hsize_t* tmp_start 239 cdef hsize_t* tmp_count 240 cdef uint64_t i 241 242 H5Sselect_none(self.space) 243 244 tmp_start = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 245 tmp_count = <hsize_t*>emalloc(sizeof(hsize_t) * self.rank) 246 try: 247 memcpy(tmp_start, self.start, sizeof(hsize_t) * self.rank) 248 memcpy(tmp_count, self.count, sizeof(hsize_t) * self.rank) 249 tmp_count[array_ix] = 1 250 251 # Iterate over the array of indices, add each hyperslab to the selection 252 for i in array_arg: 253 tmp_start[array_ix] = i 254 H5Sselect_hyperslab(self.space, H5S_SELECT_OR, tmp_start, self.stride, tmp_count, self.block) 255 finally: 256 efree(tmp_start) 257 efree(tmp_count) 258 259 260 def make_selection(self, tuple args): 261 """Apply indexing/slicing args and create a high-level selection object 262 263 Returns an instance of SimpleSelection or FancySelection, with a copy 264 of the selector's dataspace. 265 """ 266 cdef: 267 SpaceID space 268 tuple shape, start, count, step, scalar, arr_shape 269 int arr_rank, i 270 npy_intp* arr_shape_p 271 272 self.apply_args(args) 273 space = SpaceID(H5Scopy(self.space)) 274 275 shape = convert_dims(self.dims, self.rank) 276 count = convert_dims(self.count, self.rank) 277 block = convert_dims(self.block, self.rank) 278 mshape = tuple(c * b for c, b in zip(count, block)) 279 280 from ._hl.selections import SimpleSelection, FancySelection 281 282 if self.is_fancy: 283 arr_shape = tuple( 284 mshape[i] for i in range(self.rank) if not self.scalar[i] 285 ) 286 return FancySelection(shape, space, count, arr_shape) 287 else: 288 start = convert_dims(self.start, self.rank) 289 step = convert_dims(self.stride, self.rank) 290 scalar = convert_bools(self.scalar, self.rank) 291 return SimpleSelection(shape, space, (start, mshape, step, scalar)) 292 293 294cdef class Reader: 295 cdef hid_t dataset 296 cdef Selector selector 297 cdef TypeID h5_memory_datatype 298 cdef int np_typenum 299 cdef bint native_byteorder 300 301 def __cinit__(self, DatasetID dsid): 302 self.dataset = dsid.id 303 self.selector = Selector(dsid.get_space()) 304 305 # HDF5 can use e.g. custom float datatypes which don't have an exact 306 # match in numpy. Translating it to a numpy dtype chooses the smallest 307 # dtype which won't lose any data, then we translate that back to a 308 # HDF5 datatype (h5_memory_datatype). 309 h5_stored_datatype = typewrap(H5Dget_type(self.dataset)) 310 np_dtype = h5_stored_datatype.py_dtype() 311 self.np_typenum = np_dtype.num 312 self.native_byteorder = PyArray_IsNativeByteOrder(ord(np_dtype.byteorder)) 313 self.h5_memory_datatype = py_create(np_dtype) 314 315 cdef ndarray make_array(self, hsize_t* mshape): 316 """Create an array to read the selected data into. 317 318 .apply_args() should be called first, to set self.count and self.scalar. 319 Only works for simple numeric dtypes which can be defined with typenum. 320 """ 321 cdef int i, arr_rank = 0 322 cdef npy_intp* arr_shape 323 324 arr_shape = <npy_intp*>emalloc(sizeof(npy_intp) * self.selector.rank) 325 try: 326 # Copy any non-scalar selection dimensions for the array shape 327 for i in range(self.selector.rank): 328 if not self.selector.scalar[i]: 329 arr_shape[arr_rank] = mshape[i] 330 arr_rank += 1 331 332 arr = PyArray_SimpleNew(arr_rank, arr_shape, self.np_typenum) 333 if not self.native_byteorder: 334 arr = arr.newbyteorder() 335 finally: 336 efree(arr_shape) 337 338 return arr 339 340 def read(self, tuple args): 341 """Index the dataset using args and read into a new numpy array 342 343 Only works for simple numeric dtypes. 344 """ 345 cdef void* buf 346 cdef ndarray arr 347 cdef hsize_t* mshape 348 cdef hid_t mspace 349 cdef int i 350 351 self.selector.apply_args(args) 352 353 # The selected length of each dimension is count * block 354 mshape = <hsize_t*>emalloc(sizeof(hsize_t) * self.selector.rank) 355 try: 356 for i in range(self.selector.rank): 357 mshape[i] = self.selector.count[i] * self.selector.block[i] 358 arr = self.make_array(mshape) 359 buf = PyArray_DATA(arr) 360 361 mspace = H5Screate_simple(self.selector.rank, mshape, NULL) 362 finally: 363 efree(mshape) 364 365 try: 366 H5Dread(self.dataset, self.h5_memory_datatype.id, mspace, 367 self.selector.space, H5P_DEFAULT, buf) 368 finally: 369 H5Sclose(mspace) 370 371 if arr.ndim == 0: 372 return arr[()] 373 else: 374 return arr 375 376 377class MultiBlockSlice(object): 378 """ 379 A conceptual extension of the built-in slice object to allow selections 380 using start, stride, count and block. 381 382 If given, these parameters will be passed directly to 383 H5Sselect_hyperslab. The defaults are start=0, stride=1, block=1, 384 count=length, which will select the full extent. 385 386 __init__(start, stride, count, block) => Create a new MultiBlockSlice, storing 387 any given selection parameters and using defaults for the others 388 start => The offset of the starting element of the specified hyperslab 389 stride => The number of elements between the start of one block and the next 390 count => The number of blocks to select 391 block => The number of elements in each block 392 393 """ 394 395 def __init__(self, start=0, stride=1, count=None, block=1): 396 if start < 0: 397 raise ValueError("Start can't be negative") 398 if stride < 1 or (count is not None and count < 1) or block < 1: 399 raise ValueError("Stride, count and block can't be 0 or negative") 400 if block > stride: 401 raise ValueError("Blocks will overlap if block > stride") 402 403 self.start = start 404 self.stride = stride 405 self.count = count 406 self.block = block 407 408 def indices(self, length): 409 """Calculate and validate start, stride, count and block for the given length""" 410 if self.count is None: 411 # Select as many full blocks as possible without exceeding extent 412 count = (length - self.start - self.block) // self.stride + 1 413 if count < 1: 414 raise ValueError( 415 "No full blocks can be selected using {} " 416 "on dimension of length {}".format(self._repr(), length) 417 ) 418 else: 419 count = self.count 420 421 end_index = self.start + self.block + (count - 1) * self.stride - 1 422 if end_index >= length: 423 raise ValueError( 424 "{} range ({} - {}) extends beyond maximum index ({})".format( 425 self._repr(count), self.start, end_index, length - 1 426 )) 427 428 return self.start, self.stride, count, self.block 429 430 def _repr(self, count=None): 431 if count is None: 432 count = self.count 433 return "{}(start={}, stride={}, count={}, block={})".format( 434 self.__class__.__name__, self.start, self.stride, count, self.block 435 ) 436 437 def __repr__(self): 438 return self._repr(count=None) 439