1 #include <chrono>
2 #include <cstring>
3 #include <iostream>
4 #include <map>
5 #include <memory>
6 #include <string>
7 #include <vector>
8
9 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
10 #include "numpy/arrayobject.h"
11 #undef NPY_NO_DEPRECATED_API
12
13 #include "util.h"
14 #include <pybind11/numpy.h>
15 #include <pybind11/pybind11.h>
16 #include <pybind11/pytypes.h>
17
18 #if !defined(NDEBUG)
19 #include "debug.cc"
20 #endif
21
22 #define TILEDB_DEPRECATED
23 #define TILEDB_DEPRECATED_EXPORT
24
25 #include <tiledb/tiledb> // C++
26
27 // anonymous namespace for helper functions
28 namespace {
29
30 namespace py = pybind11;
31
issubdtype(py::dtype t1,py::dtype t2)32 bool issubdtype(py::dtype t1, py::dtype t2) {
33 // TODO importing every time is Not Great...
34 auto np = py::module::import("numpy");
35 auto npsubdtype = np.attr("issubdtype");
36
37 return py::cast<bool>(npsubdtype(t1, t2));
38 }
39
get_dtype(T obj)40 template <typename T> py::dtype get_dtype(T obj) {
41 auto &api = py::detail::npy_api::get();
42
43 if (api.PyArray_Check_(obj.ptr())) {
44 return py::cast<py::array>(obj).dtype();
45 }
46
47 return py::reinterpret_steal<py::dtype>(
48 api.PyArray_DescrFromScalar_(obj.ptr()));
49 }
50
51 // check whether dtypes are equivalent from numpy perspective
52 // note: d1::dtype.is(d2) checks *object identity* which is
53 // not what we want.
dtype_equal(py::dtype d1,py::dtype d2)54 bool dtype_equal(py::dtype d1, py::dtype d2) {
55 auto &api = py::detail::npy_api::get();
56
57 return api.PyArray_EquivTypes_(d1.ptr(), d2.ptr());
58 }
59
60 }; // namespace
61
62 namespace tiledbpy {
63
64 using namespace std;
65 using namespace tiledb;
66 namespace py = pybind11;
67 using namespace pybind11::literals;
68
69 #if PY_MAJOR_VERSION >= 3
70 class NumpyConvert {
71 private:
72 bool use_iter_ = false;
73 bool allow_unicode_ = true;
74 size_t data_nbytes_ = 0;
75 size_t input_len_ = 0;
76
77 py::array input_;
78 // we are using vector as a buffer here because they are grown in some
79 // situations
80 std::vector<uint8_t> *data_buf_;
81 std::vector<uint64_t> *offset_buf_;
82
convert_unicode()83 void convert_unicode() {
84 // Convert array of strings to UTF-8 buffer+offsets
85
86 // NOTE: NumPy fixed-length string arrays *do not support* embedded nulls.
87 // There is no string size stored, so string end is demarcated by \0
88 // and the slot is filled to the next boundary with \0.
89 // For consistency and to avoid complications in other APIs, we are storing
90 // all string arrays as var-length.
91
92 size_t input_itemsize = input_.itemsize();
93 assert(input_itemsize > 0); // must have fixed-length array
94
95 // we know exact offset count
96 offset_buf_->resize(input_len_);
97
98 // we reserve the input length as a minimum size for output
99 data_buf_->resize(input_len_);
100
101 // size (bytes) of current object data
102 Py_ssize_t sz = 0;
103 // object data (string or bytes)
104 const char *input_p = nullptr;
105
106 unsigned char *output_p = nullptr;
107 output_p = data_buf_->data();
108
109 // avoid one interpreter roundtrip
110 auto npstrencode = py::module::import("numpy").attr("str_").attr("encode");
111
112 // return status
113 int rc;
114 // encoded object: this must live outside the if block or else it may be
115 // GC'd
116 // putting outside for loop to avoid repeat unused
117 // construction
118 py::object u_encoded;
119
120 // loop over array objects and write to output buffer
121 size_t idx = 0;
122 for (auto u : input_) {
123 // don't encode if we already have bytes
124 if (PyUnicode_Check(u.ptr())) {
125 // TODO see if we can do this with PyUnicode_AsUTF8String
126 u_encoded = npstrencode(u);
127 rc = PyBytes_AsStringAndSize(u_encoded.ptr(),
128 const_cast<char **>(&input_p), &sz);
129 } else {
130 rc = PyBytes_AsStringAndSize(u.ptr(), const_cast<char **>(&input_p),
131 &sz);
132 }
133
134 if (rc == -1) {
135 throw std::runtime_error(
136 "PyBytes_AsStringAndSize failed to encode string");
137 }
138
139 // record the offset (equal to the current bytes written)
140 offset_buf_->data()[idx] = data_nbytes_;
141
142 if (data_buf_->size() < data_nbytes_ + sz) {
143 data_buf_->resize(data_nbytes_ + sz);
144 // update the output pointer and adjust for previous iteration
145 output_p = data_buf_->data() + data_nbytes_;
146 }
147
148 memcpy(output_p, input_p, sz);
149
150 data_nbytes_ += sz;
151 output_p += sz;
152 idx++;
153 }
154 }
155
convert_bytes()156 void convert_bytes() {
157 // Convert array of bytes objects or ASCII strings to buffer+offsets
158
159 size_t input_itemsize = input_.itemsize();
160 assert(input_itemsize > 0); // must have fixed-length array
161
162 // we know exact offset count
163 offset_buf_->resize(input_len_);
164
165 // we reserve the input length as a minimum size for output
166 data_buf_->resize(input_len_);
167
168 // size (bytes) of current object data
169 Py_ssize_t sz = 0;
170 // object data (string or bytes)
171 const char *input_p = nullptr;
172
173 unsigned char *output_p = nullptr;
174 output_p = data_buf_->data();
175
176 int rc;
177
178 // avoid one interpreter roundtrip
179 // auto npstrencode =
180 // py::module::import("numpy").attr("str_").attr("encode");
181
182 // TODO: ideally we would encode directly here without the intermediate
183 // unicode object
184 // TODO add test for different memory orderings
185
186 // loop over array objects and write to output buffer
187 size_t idx = 0;
188 for (auto obj : input_) {
189 auto o = obj.ptr();
190
191 // don't encode if we already have bytes
192 /*
193 if (PyUnicode_Check(u.ptr())) {
194 // TODO see if we can do this with PyUnicode_AsUTF8String
195 u_encoded = npstrencode(u);
196 }
197 */
198
199 rc = PyBytes_AsStringAndSize(o, const_cast<char **>(&input_p), &sz);
200 if (rc == -1) {
201 throw std::runtime_error(
202 "PyBytes_AsStringAndSize failed to encode string");
203 }
204
205 // record the offset (equal to the current bytes written)
206 offset_buf_->data()[idx] = data_nbytes_;
207
208 if (data_buf_->size() < data_nbytes_ + sz) {
209 data_buf_->resize(data_nbytes_ + sz);
210 // update the output pointer and adjust for previous iteration
211 output_p = data_buf_->data() + data_nbytes_;
212 }
213
214 memcpy(output_p, input_p, sz);
215
216 data_nbytes_ += sz;
217 output_p += sz;
218 idx++;
219 }
220 }
221
convert_object()222 void convert_object() {
223 // Convert np.dtype("O") array of objects to buffer+offsets
224
225 auto &api = py::detail::npy_api::get();
226
227 offset_buf_->resize(input_len_);
228
229 auto input_unchecked = input_.unchecked<py::object, 1>();
230
231 // size (bytes) of current object data
232 Py_ssize_t sz = 0;
233 // current data
234 const char *input_p = nullptr;
235
236 auto input_size = input_.size();
237 py::dtype first_dtype;
238
239 // first pass: calculate final buffer length and cache UTF-8 representations
240 for (int64_t idx = 0; idx < input_size; idx++) {
241 offset_buf_->data()[idx] = data_nbytes_;
242
243 PyObject *o = input_unchecked.data(idx)->ptr();
244 assert(o != nullptr);
245
246 // NOTE: every branch below *must* initialize first_dtype
247
248 if (PyUnicode_Check(o)) {
249 if (!allow_unicode_) {
250 // TODO TPY_ERROR_LOC
251 auto errmsg = std::string(
252 "Unexpected unicode object for TILEDB_STRING_ASCII attribute");
253 throw std::runtime_error(errmsg);
254 }
255
256 if (idx < 1)
257 first_dtype = py::dtype("unicode");
258
259 // this will cache a utf-8 representation owned by the PyObject
260 input_p = PyUnicode_AsUTF8AndSize(o, &sz);
261 if (!input_p) {
262 TPY_ERROR_LOC("Internal error: failed to convert unicode to UTF-8");
263 }
264 } else if (PyBytes_Check(o)) {
265 // ASCII only
266 auto res =
267 PyBytes_AsStringAndSize(o, const_cast<char **>(&input_p), &sz);
268
269 if (idx < 1)
270 first_dtype = py::dtype("bytes");
271
272 if (res == -1) {
273 // TODO TPY_ERROR_LOC
274 throw std::runtime_error(
275 "Internal error: failed to get char* from bytes object");
276 }
277 } else if (api.PyArray_Check_(o)) {
278 auto a = py::cast<py::array>(o);
279 // handle (potentially) var-len embedded arrays
280 if (idx < 1) {
281 first_dtype = get_dtype(a);
282 } else if (!dtype_equal(get_dtype(a), first_dtype)) {
283 throw py::type_error(
284 "Mismatched dtype in object array to buffer conversion!");
285 }
286
287 sz = a.nbytes();
288 } else {
289 // TODO write the type in the error here
290 // auto o_h = py::reinterpret_borrow<py::object>(o);
291 // auto o_t = py::type::of(o);
292 auto errmsg =
293 std::string("Unexpected object type in string conversion");
294 TPY_ERROR_LOC(errmsg);
295 }
296
297 data_nbytes_ += sz;
298 }
299
300 data_buf_->resize(data_nbytes_);
301
302 // second pass: copy the data to output buffer
303 unsigned char *output_p = data_buf_->data();
304
305 // copy data to output buffers
306 for (int64_t idx = 0; idx < input_size; idx++) {
307 PyObject *pyobj_p = input_unchecked.data(idx)->ptr();
308
309 assert(pyobj_p != nullptr);
310
311 if (PyUnicode_Check(pyobj_p)) {
312 input_p = PyUnicode_AsUTF8AndSize(pyobj_p, &sz);
313 assert(input_p != nullptr);
314 } else if (PyBytes_Check(pyobj_p)) {
315 // TODO error check?
316 PyBytes_AsStringAndSize(pyobj_p, const_cast<char **>(&input_p), &sz);
317 } else if (api.PyArray_Check_(pyobj_p)) {
318 auto pao = (PyArrayObject *)pyobj_p;
319 auto arr = py::cast<py::array>(pyobj_p);
320 sz = arr.nbytes();
321 input_p = (const char *)arr.data();
322 } else {
323 // TODO add object type
324 TPY_ERROR_LOC("Unexpected object type in buffer conversion");
325 }
326
327 memcpy(output_p, input_p, sz);
328 // increment the output pointer for the next object
329 output_p += sz;
330 }
331 }
332
convert_iter()333 void convert_iter() {
334 // Convert array of non-contiguous objects to buffer+offsets
335 // using iterator protocol.
336 // For non-contiguous arrays (such as views) we must iterate rather
337 // than indexing directly.
338
339 auto &npy_api = py::detail::npy_api::get();
340
341 offset_buf_->resize(input_.size());
342
343 auto iter = input_.attr("flat");
344
345 // size (bytes) of current object data
346 Py_ssize_t sz = 0;
347 // current data
348 const char *input_p = nullptr;
349
350 size_t idx = 0;
351
352 py::dtype first_dtype;
353
354 for (auto obj_h : iter) {
355 if (idx < 1) {
356 // record the first dtype for consistency check
357 first_dtype = get_dtype(obj_h);
358 }
359 offset_buf_->data()[idx] = data_nbytes_;
360
361 PyObject *obj_p = obj_h.ptr();
362
363 // we must check each dtype because object arrays are not guaranteed to
364 // be homogenous
365 auto cur_dtype = get_dtype(obj_h);
366 auto err_str =
367 std::string("Mismatched element type in buffer conversion!");
368 if ((first_dtype.kind() == cur_dtype.kind()) ||
369 (first_dtype.kind() == cur_dtype.kind())) {
370 // pass
371 } else if (!dtype_equal(cur_dtype, first_dtype)) {
372 throw py::type_error(err_str);
373 }
374
375 if (PyUnicode_Check(obj_p)) {
376 if (!allow_unicode_) {
377 // TODO TPY_ERROR_LOC
378 auto errmsg = std::string(
379 "Unexpected unicode object for TILEDB_STRING_ASCII attribute");
380 throw std::runtime_error(errmsg);
381 }
382
383 // this will cache a utf-8 representation owned by the PyObject
384 input_p = PyUnicode_AsUTF8AndSize(obj_p, &sz);
385 if (!input_p) {
386 TPY_ERROR_LOC("Internal error: failed to convert unicode to UTF-8");
387 }
388 } else if (PyBytes_Check(obj_p)) {
389 // ASCII only
390 auto res =
391 PyBytes_AsStringAndSize(obj_p, const_cast<char **>(&input_p), &sz);
392
393 if (res == -1) {
394 // TODO TPY_ERROR_LOC
395 throw std::runtime_error(
396 "Internal error: failed to get char* from bytes object");
397 }
398 } else if (npy_api.PyArray_Check_(obj_p)) {
399 // handle (potentially) var-len embedded arrays
400 sz = py::cast<py::array>(obj_p).nbytes();
401 } else {
402 auto errmsg =
403 std::string("Unexpected object type in string conversion");
404 TPY_ERROR_LOC(errmsg);
405 }
406 data_nbytes_ += sz;
407 idx++;
408 }
409
410 data_buf_->resize(data_nbytes_);
411 // second pass: write the data to output buffer
412 unsigned char *output_p = data_buf_->data();
413
414 // reset the iterator
415 iter = input_.attr("flat");
416
417 // copy data to output buffers
418 for (auto obj_h : iter) {
419 auto obj_p = obj_h.ptr();
420
421 if (PyUnicode_Check(obj_p)) {
422 input_p = PyUnicode_AsUTF8AndSize(obj_p, &sz);
423 assert(input_p != nullptr);
424 } else if (PyBytes_Check(obj_p)) {
425 // TODO error check?
426 PyBytes_AsStringAndSize(obj_p, const_cast<char **>(&input_p), &sz);
427 } else if (npy_api.PyArray_Check_(obj_p)) {
428 // auto pao = (PyArrayObject*)o;
429 // input_p = (const char*)PyArray_DATA(pao);
430 // sz = PyArray_NBYTES(pao);
431 auto o_a = py::cast<py::array>(obj_h);
432 sz = o_a.nbytes();
433 input_p = (const char *)o_a.data();
434 } else {
435 TPY_ERROR_LOC("Unexpected object type in buffer conversion");
436 }
437
438 memcpy(output_p, input_p, sz);
439 // increment the output pointer for the next object
440 output_p += sz;
441 }
442 }
443
444 public:
445 /*
446 Initialize the converter
447 */
NumpyConvert(py::array input)448 NumpyConvert(py::array input) {
449 // require a flat buffer
450 if (input.ndim() != 1) {
451 // try to take a 1D view on the input
452 auto v = input.attr("view")();
453 // this will throw if the shape cannot be modified zero-copy,
454 // which is what we want
455 try {
456 v.attr("shape") = py::int_(input.size());
457 } catch (py::error_already_set &e) {
458 if (e.matches(PyExc_AttributeError)) {
459 use_iter_ = true;
460 } else {
461 throw;
462 }
463 } catch (std::exception &e) {
464 std::cout << e.what() << std::endl;
465 }
466 input_ = v;
467 } else {
468 input_ = input;
469 }
470
471 input_len_ = py::len(input_);
472
473 data_buf_ = new std::vector<uint8_t>();
474 offset_buf_ = new std::vector<uint64_t>(input_len_);
475 }
476
// Release whichever buffers are still owned by the converter. get() nulls
// the pointers once ownership has moved to the returned arrays, and deleting
// a null pointer is a no-op, so no explicit checks are needed.
// NOTE(review): the class is implicitly copyable while owning raw pointers;
// a copy would delete the same buffers twice -- avoid copying.
~NumpyConvert() {
  delete data_buf_;
  delete offset_buf_;
}
483
484 /*
485 Set allow_unicode_ flag
486 */
allow_unicode()487 bool allow_unicode() { return allow_unicode_; }
allow_unicode(bool allow_unicode)488 void allow_unicode(bool allow_unicode) { allow_unicode_ = allow_unicode; }
489
490 /*
491 Returns a tuple of py::array containing
492 (data:array_t<uint8>, offsets:array_t<uint64_t>)
493 */
get()494 py::tuple get() {
495 auto input_dtype = input_.dtype();
496
497 if (use_iter_) {
498 // slow, safe path
499 convert_iter();
500 } else if (issubdtype(input_dtype, py::dtype("unicode"))) {
501 if (allow_unicode_) {
502 convert_unicode();
503 } else {
504 throw std::runtime_error("Unexpected fixed-length unicode array");
505 }
506 } else if (issubdtype(input_dtype, py::dtype("bytes"))) {
507 convert_bytes();
508 } else if (!input_dtype.is(py::dtype("O"))) {
509 // TODO TPY_ERROR_LOC
510 throw std::runtime_error("expected object array");
511 } else {
512 convert_object();
513 }
514
515 auto tmp_data_buf_p = data_buf_;
516 auto data_ref = py::capsule(data_buf_, [](void *v) {
517 delete reinterpret_cast<std::vector<uint8_t> *>(v);
518 });
519 data_buf_ = nullptr; // disown: capsule owns it
520
521 auto tmp_offset_buf_p = offset_buf_;
522 auto offset_ref = py::capsule(offset_buf_, [](void *v) {
523 delete reinterpret_cast<std::vector<uint64_t> *>(v);
524 });
525 offset_buf_ = nullptr; // disown: capsule owns it now
526
527 auto data_np = py::array_t<uint8_t>(tmp_data_buf_p->size(),
528 tmp_data_buf_p->data(), data_ref);
529 auto offset_np = py::array_t<uint64_t>(
530 tmp_offset_buf_p->size(), tmp_offset_buf_p->data(), offset_ref);
531
532 return py::make_tuple(data_np, offset_np);
533 }
534 };
535 #endif
536
convert_np(py::array input,bool allow_unicode,bool use_fallback=false)537 py::tuple convert_np(py::array input, bool allow_unicode,
538 bool use_fallback = false) {
539 #if PY_MAJOR_VERSION >= 3
540 if (use_fallback) {
541 #endif
542 auto tiledb = py::module::import("tiledb");
543 auto libtiledb = tiledb.attr("libtiledb");
544 auto array_to_buffer = libtiledb.attr("array_to_buffer");
545 return array_to_buffer(input);
546 #if PY_MAJOR_VERSION >= 3
547 } else {
548 NumpyConvert cvt(input);
549 cvt.allow_unicode(allow_unicode);
550 return cvt.get();
551 }
552 #endif
553 }
554
555 }; // namespace tiledbpy