1 #include <chrono>
2 #include <cstring>
3 #include <iostream>
4 #include <map>
5 #include <memory>
6 #include <string>
7 #include <vector>
8 
9 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
10 #include "numpy/arrayobject.h"
11 #undef NPY_NO_DEPRECATED_API
12 
13 #include "util.h"
14 #include <pybind11/numpy.h>
15 #include <pybind11/pybind11.h>
16 #include <pybind11/pytypes.h>
17 
18 #if !defined(NDEBUG)
19 #include "debug.cc"
20 #endif
21 
22 #define TILEDB_DEPRECATED
23 #define TILEDB_DEPRECATED_EXPORT
24 
25 #include <tiledb/tiledb> // C++
26 
27 // anonymous namespace for helper functions
28 namespace {
29 
30 namespace py = pybind11;
31 
issubdtype(py::dtype t1,py::dtype t2)32 bool issubdtype(py::dtype t1, py::dtype t2) {
33   // TODO importing every time is Not Great...
34   auto np = py::module::import("numpy");
35   auto npsubdtype = np.attr("issubdtype");
36 
37   return py::cast<bool>(npsubdtype(t1, t2));
38 }
39 
get_dtype(T obj)40 template <typename T> py::dtype get_dtype(T obj) {
41   auto &api = py::detail::npy_api::get();
42 
43   if (api.PyArray_Check_(obj.ptr())) {
44     return py::cast<py::array>(obj).dtype();
45   }
46 
47   return py::reinterpret_steal<py::dtype>(
48       api.PyArray_DescrFromScalar_(obj.ptr()));
49 }
50 
51 // check whether dtypes are equivalent from numpy perspective
52 // note: d1::dtype.is(d2) checks *object identity* which is
53 //       not what we want.
dtype_equal(py::dtype d1,py::dtype d2)54 bool dtype_equal(py::dtype d1, py::dtype d2) {
55   auto &api = py::detail::npy_api::get();
56 
57   return api.PyArray_EquivTypes_(d1.ptr(), d2.ptr());
58 }
59 
60 }; // namespace
61 
62 namespace tiledbpy {
63 
64 using namespace std;
65 using namespace tiledb;
66 namespace py = pybind11;
67 using namespace pybind11::literals;
68 
69 #if PY_MAJOR_VERSION >= 3
70 class NumpyConvert {
71 private:
72   bool use_iter_ = false;
73   bool allow_unicode_ = true;
74   size_t data_nbytes_ = 0;
75   size_t input_len_ = 0;
76 
77   py::array input_;
78   // we are using vector as a buffer here because they are grown in some
79   // situations
80   std::vector<uint8_t> *data_buf_;
81   std::vector<uint64_t> *offset_buf_;
82 
convert_unicode()83   void convert_unicode() {
84     // Convert array of strings to UTF-8 buffer+offsets
85 
86     // NOTE: NumPy fixed-length string arrays *do not support* embedded nulls.
87     //       There is no string size stored, so string end is demarcated by \0
88     //       and the slot is filled to the next boundary with \0.
89     // For consistency and to avoid complications in other APIs, we are storing
90     // all string arrays as var-length.
91 
92     size_t input_itemsize = input_.itemsize();
93     assert(input_itemsize > 0); // must have fixed-length array
94 
95     // we know exact offset count
96     offset_buf_->resize(input_len_);
97 
98     // we reserve the input length as a minimum size for output
99     data_buf_->resize(input_len_);
100 
101     // size (bytes) of current object data
102     Py_ssize_t sz = 0;
103     // object data (string or bytes)
104     const char *input_p = nullptr;
105 
106     unsigned char *output_p = nullptr;
107     output_p = data_buf_->data();
108 
109     // avoid one interpreter roundtrip
110     auto npstrencode = py::module::import("numpy").attr("str_").attr("encode");
111 
112     // return status
113     int rc;
114     // encoded object: this must live outside the if block or else it may be
115     // GC'd
116     //                 putting outside for loop to avoid repeat unused
117     //                 construction
118     py::object u_encoded;
119 
120     // loop over array objects and write to output buffer
121     size_t idx = 0;
122     for (auto u : input_) {
123       // don't encode if we already have bytes
124       if (PyUnicode_Check(u.ptr())) {
125         // TODO see if we can do this with PyUnicode_AsUTF8String
126         u_encoded = npstrencode(u);
127         rc = PyBytes_AsStringAndSize(u_encoded.ptr(),
128                                      const_cast<char **>(&input_p), &sz);
129       } else {
130         rc = PyBytes_AsStringAndSize(u.ptr(), const_cast<char **>(&input_p),
131                                      &sz);
132       }
133 
134       if (rc == -1) {
135         throw std::runtime_error(
136             "PyBytes_AsStringAndSize failed to encode string");
137       }
138 
139       // record the offset (equal to the current bytes written)
140       offset_buf_->data()[idx] = data_nbytes_;
141 
142       if (data_buf_->size() < data_nbytes_ + sz) {
143         data_buf_->resize(data_nbytes_ + sz);
144         // update the output pointer and adjust for previous iteration
145         output_p = data_buf_->data() + data_nbytes_;
146       }
147 
148       memcpy(output_p, input_p, sz);
149 
150       data_nbytes_ += sz;
151       output_p += sz;
152       idx++;
153     }
154   }
155 
convert_bytes()156   void convert_bytes() {
157     // Convert array of bytes objects or ASCII strings to buffer+offsets
158 
159     size_t input_itemsize = input_.itemsize();
160     assert(input_itemsize > 0); // must have fixed-length array
161 
162     // we know exact offset count
163     offset_buf_->resize(input_len_);
164 
165     // we reserve the input length as a minimum size for output
166     data_buf_->resize(input_len_);
167 
168     // size (bytes) of current object data
169     Py_ssize_t sz = 0;
170     // object data (string or bytes)
171     const char *input_p = nullptr;
172 
173     unsigned char *output_p = nullptr;
174     output_p = data_buf_->data();
175 
176     int rc;
177 
178     // avoid one interpreter roundtrip
179     // auto npstrencode =
180     // py::module::import("numpy").attr("str_").attr("encode");
181 
182     // TODO: ideally we would encode directly here without the intermediate
183     // unicode object
184     // TODO add test for different memory orderings
185 
186     // loop over array objects and write to output buffer
187     size_t idx = 0;
188     for (auto obj : input_) {
189       auto o = obj.ptr();
190 
191       // don't encode if we already have bytes
192       /*
193       if (PyUnicode_Check(u.ptr())) {
194         // TODO see if we can do this with PyUnicode_AsUTF8String
195         u_encoded = npstrencode(u);
196       }
197       */
198 
199       rc = PyBytes_AsStringAndSize(o, const_cast<char **>(&input_p), &sz);
200       if (rc == -1) {
201         throw std::runtime_error(
202             "PyBytes_AsStringAndSize failed to encode string");
203       }
204 
205       // record the offset (equal to the current bytes written)
206       offset_buf_->data()[idx] = data_nbytes_;
207 
208       if (data_buf_->size() < data_nbytes_ + sz) {
209         data_buf_->resize(data_nbytes_ + sz);
210         // update the output pointer and adjust for previous iteration
211         output_p = data_buf_->data() + data_nbytes_;
212       }
213 
214       memcpy(output_p, input_p, sz);
215 
216       data_nbytes_ += sz;
217       output_p += sz;
218       idx++;
219     }
220   }
221 
convert_object()222   void convert_object() {
223     // Convert np.dtype("O") array of objects to buffer+offsets
224 
225     auto &api = py::detail::npy_api::get();
226 
227     offset_buf_->resize(input_len_);
228 
229     auto input_unchecked = input_.unchecked<py::object, 1>();
230 
231     // size (bytes) of current object data
232     Py_ssize_t sz = 0;
233     // current data
234     const char *input_p = nullptr;
235 
236     auto input_size = input_.size();
237     py::dtype first_dtype;
238 
239     // first pass: calculate final buffer length and cache UTF-8 representations
240     for (int64_t idx = 0; idx < input_size; idx++) {
241       offset_buf_->data()[idx] = data_nbytes_;
242 
243       PyObject *o = input_unchecked.data(idx)->ptr();
244       assert(o != nullptr);
245 
246       // NOTE: every branch below *must* initialize first_dtype
247 
248       if (PyUnicode_Check(o)) {
249         if (!allow_unicode_) {
250           // TODO TPY_ERROR_LOC
251           auto errmsg = std::string(
252               "Unexpected unicode object for TILEDB_STRING_ASCII attribute");
253           throw std::runtime_error(errmsg);
254         }
255 
256         if (idx < 1)
257           first_dtype = py::dtype("unicode");
258 
259         // this will cache a utf-8 representation owned by the PyObject
260         input_p = PyUnicode_AsUTF8AndSize(o, &sz);
261         if (!input_p) {
262           TPY_ERROR_LOC("Internal error: failed to convert unicode to UTF-8");
263         }
264       } else if (PyBytes_Check(o)) {
265         // ASCII only
266         auto res =
267             PyBytes_AsStringAndSize(o, const_cast<char **>(&input_p), &sz);
268 
269         if (idx < 1)
270           first_dtype = py::dtype("bytes");
271 
272         if (res == -1) {
273           // TODO TPY_ERROR_LOC
274           throw std::runtime_error(
275               "Internal error: failed to get char* from bytes object");
276         }
277       } else if (api.PyArray_Check_(o)) {
278         auto a = py::cast<py::array>(o);
279         // handle (potentially) var-len embedded arrays
280         if (idx < 1) {
281           first_dtype = get_dtype(a);
282         } else if (!dtype_equal(get_dtype(a), first_dtype)) {
283           throw py::type_error(
284               "Mismatched dtype in object array to buffer conversion!");
285         }
286 
287         sz = a.nbytes();
288       } else {
289         // TODO write the type in the error here
290         // auto o_h = py::reinterpret_borrow<py::object>(o);
291         // auto o_t = py::type::of(o);
292         auto errmsg =
293             std::string("Unexpected object type in string conversion");
294         TPY_ERROR_LOC(errmsg);
295       }
296 
297       data_nbytes_ += sz;
298     }
299 
300     data_buf_->resize(data_nbytes_);
301 
302     // second pass: copy the data to output buffer
303     unsigned char *output_p = data_buf_->data();
304 
305     // copy data to output buffers
306     for (int64_t idx = 0; idx < input_size; idx++) {
307       PyObject *pyobj_p = input_unchecked.data(idx)->ptr();
308 
309       assert(pyobj_p != nullptr);
310 
311       if (PyUnicode_Check(pyobj_p)) {
312         input_p = PyUnicode_AsUTF8AndSize(pyobj_p, &sz);
313         assert(input_p != nullptr);
314       } else if (PyBytes_Check(pyobj_p)) {
315         // TODO error check?
316         PyBytes_AsStringAndSize(pyobj_p, const_cast<char **>(&input_p), &sz);
317       } else if (api.PyArray_Check_(pyobj_p)) {
318         auto pao = (PyArrayObject *)pyobj_p;
319         auto arr = py::cast<py::array>(pyobj_p);
320         sz = arr.nbytes();
321         input_p = (const char *)arr.data();
322       } else {
323         // TODO add object type
324         TPY_ERROR_LOC("Unexpected object type in buffer conversion");
325       }
326 
327       memcpy(output_p, input_p, sz);
328       // increment the output pointer for the next object
329       output_p += sz;
330     }
331   }
332 
convert_iter()333   void convert_iter() {
334     // Convert array of non-contiguous objects to buffer+offsets
335     // using iterator protocol.
336     // For non-contiguous arrays (such as views) we must iterate rather
337     // than indexing directly.
338 
339     auto &npy_api = py::detail::npy_api::get();
340 
341     offset_buf_->resize(input_.size());
342 
343     auto iter = input_.attr("flat");
344 
345     // size (bytes) of current object data
346     Py_ssize_t sz = 0;
347     // current data
348     const char *input_p = nullptr;
349 
350     size_t idx = 0;
351 
352     py::dtype first_dtype;
353 
354     for (auto obj_h : iter) {
355       if (idx < 1) {
356         // record the first dtype for consistency check
357         first_dtype = get_dtype(obj_h);
358       }
359       offset_buf_->data()[idx] = data_nbytes_;
360 
361       PyObject *obj_p = obj_h.ptr();
362 
363       // we must check each dtype because object arrays are not guaranteed to
364       // be homogenous
365       auto cur_dtype = get_dtype(obj_h);
366       auto err_str =
367           std::string("Mismatched element type in buffer conversion!");
368       if ((first_dtype.kind() == cur_dtype.kind()) ||
369           (first_dtype.kind() == cur_dtype.kind())) {
370         // pass
371       } else if (!dtype_equal(cur_dtype, first_dtype)) {
372         throw py::type_error(err_str);
373       }
374 
375       if (PyUnicode_Check(obj_p)) {
376         if (!allow_unicode_) {
377           // TODO TPY_ERROR_LOC
378           auto errmsg = std::string(
379               "Unexpected unicode object for TILEDB_STRING_ASCII attribute");
380           throw std::runtime_error(errmsg);
381         }
382 
383         // this will cache a utf-8 representation owned by the PyObject
384         input_p = PyUnicode_AsUTF8AndSize(obj_p, &sz);
385         if (!input_p) {
386           TPY_ERROR_LOC("Internal error: failed to convert unicode to UTF-8");
387         }
388       } else if (PyBytes_Check(obj_p)) {
389         // ASCII only
390         auto res =
391             PyBytes_AsStringAndSize(obj_p, const_cast<char **>(&input_p), &sz);
392 
393         if (res == -1) {
394           // TODO TPY_ERROR_LOC
395           throw std::runtime_error(
396               "Internal error: failed to get char* from bytes object");
397         }
398       } else if (npy_api.PyArray_Check_(obj_p)) {
399         // handle (potentially) var-len embedded arrays
400         sz = py::cast<py::array>(obj_p).nbytes();
401       } else {
402         auto errmsg =
403             std::string("Unexpected object type in string conversion");
404         TPY_ERROR_LOC(errmsg);
405       }
406       data_nbytes_ += sz;
407       idx++;
408     }
409 
410     data_buf_->resize(data_nbytes_);
411     // second pass: write the data to output buffer
412     unsigned char *output_p = data_buf_->data();
413 
414     // reset the iterator
415     iter = input_.attr("flat");
416 
417     // copy data to output buffers
418     for (auto obj_h : iter) {
419       auto obj_p = obj_h.ptr();
420 
421       if (PyUnicode_Check(obj_p)) {
422         input_p = PyUnicode_AsUTF8AndSize(obj_p, &sz);
423         assert(input_p != nullptr);
424       } else if (PyBytes_Check(obj_p)) {
425         // TODO error check?
426         PyBytes_AsStringAndSize(obj_p, const_cast<char **>(&input_p), &sz);
427       } else if (npy_api.PyArray_Check_(obj_p)) {
428         // auto pao = (PyArrayObject*)o;
429         // input_p = (const char*)PyArray_DATA(pao);
430         // sz = PyArray_NBYTES(pao);
431         auto o_a = py::cast<py::array>(obj_h);
432         sz = o_a.nbytes();
433         input_p = (const char *)o_a.data();
434       } else {
435         TPY_ERROR_LOC("Unexpected object type in buffer conversion");
436       }
437 
438       memcpy(output_p, input_p, sz);
439       // increment the output pointer for the next object
440       output_p += sz;
441     }
442   }
443 
444 public:
445   /*
446     Initialize the converter
447   */
NumpyConvert(py::array input)448   NumpyConvert(py::array input) {
449     // require a flat buffer
450     if (input.ndim() != 1) {
451       // try to take a 1D view on the input
452       auto v = input.attr("view")();
453       // this will throw if the shape cannot be modified zero-copy,
454       // which is what we want
455       try {
456         v.attr("shape") = py::int_(input.size());
457       } catch (py::error_already_set &e) {
458         if (e.matches(PyExc_AttributeError)) {
459           use_iter_ = true;
460         } else {
461           throw;
462         }
463       } catch (std::exception &e) {
464         std::cout << e.what() << std::endl;
465       }
466       input_ = v;
467     } else {
468       input_ = input;
469     }
470 
471     input_len_ = py::len(input_);
472 
473     data_buf_ = new std::vector<uint8_t>();
474     offset_buf_ = new std::vector<uint64_t>(input_len_);
475   }
476 
~NumpyConvert()477   ~NumpyConvert() {
478     if (data_buf_)
479       delete data_buf_;
480     if (offset_buf_)
481       delete offset_buf_;
482   }
483 
484   /*
485     Set allow_unicode_ flag
486   */
allow_unicode()487   bool allow_unicode() { return allow_unicode_; }
allow_unicode(bool allow_unicode)488   void allow_unicode(bool allow_unicode) { allow_unicode_ = allow_unicode; }
489 
490   /*
491     Returns a tuple of py::array containing
492       (data:array_t<uint8>, offsets:array_t<uint64_t>)
493   */
get()494   py::tuple get() {
495     auto input_dtype = input_.dtype();
496 
497     if (use_iter_) {
498       // slow, safe path
499       convert_iter();
500     } else if (issubdtype(input_dtype, py::dtype("unicode"))) {
501       if (allow_unicode_) {
502         convert_unicode();
503       } else {
504         throw std::runtime_error("Unexpected fixed-length unicode array");
505       }
506     } else if (issubdtype(input_dtype, py::dtype("bytes"))) {
507       convert_bytes();
508     } else if (!input_dtype.is(py::dtype("O"))) {
509       // TODO TPY_ERROR_LOC
510       throw std::runtime_error("expected object array");
511     } else {
512       convert_object();
513     }
514 
515     auto tmp_data_buf_p = data_buf_;
516     auto data_ref = py::capsule(data_buf_, [](void *v) {
517       delete reinterpret_cast<std::vector<uint8_t> *>(v);
518     });
519     data_buf_ = nullptr; // disown: capsule owns it
520 
521     auto tmp_offset_buf_p = offset_buf_;
522     auto offset_ref = py::capsule(offset_buf_, [](void *v) {
523       delete reinterpret_cast<std::vector<uint64_t> *>(v);
524     });
525     offset_buf_ = nullptr; // disown: capsule owns it now
526 
527     auto data_np = py::array_t<uint8_t>(tmp_data_buf_p->size(),
528                                         tmp_data_buf_p->data(), data_ref);
529     auto offset_np = py::array_t<uint64_t>(
530         tmp_offset_buf_p->size(), tmp_offset_buf_p->data(), offset_ref);
531 
532     return py::make_tuple(data_np, offset_np);
533   }
534 };
535 #endif
536 
convert_np(py::array input,bool allow_unicode,bool use_fallback=false)537 py::tuple convert_np(py::array input, bool allow_unicode,
538                      bool use_fallback = false) {
539 #if PY_MAJOR_VERSION >= 3
540   if (use_fallback) {
541 #endif
542     auto tiledb = py::module::import("tiledb");
543     auto libtiledb = tiledb.attr("libtiledb");
544     auto array_to_buffer = libtiledb.attr("array_to_buffer");
545     return array_to_buffer(input);
546 #if PY_MAJOR_VERSION >= 3
547   } else {
548     NumpyConvert cvt(input);
549     cvt.allow_unicode(allow_unicode);
550     return cvt.get();
551   }
552 #endif
553 }
554 
555 }; // namespace tiledbpy