1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 #pragma once 19 20 #include <memory> 21 #include <vector> 22 23 #include "arrow/ipc/options.h" 24 #include "arrow/python/visibility.h" 25 #include "arrow/sparse_tensor.h" 26 #include "arrow/status.h" 27 28 // Forward declaring PyObject, see 29 // https://mail.python.org/pipermail/python-dev/2003-August/037601.html 30 #ifndef PyObject_HEAD 31 struct _object; 32 typedef _object PyObject; 33 #endif 34 35 namespace arrow { 36 37 class Buffer; 38 class DataType; 39 class MemoryPool; 40 class RecordBatch; 41 class Tensor; 42 43 namespace io { 44 45 class OutputStream; 46 47 } // namespace io 48 49 namespace py { 50 51 struct ARROW_PYTHON_EXPORT SerializedPyObject { 52 std::shared_ptr<RecordBatch> batch; 53 std::vector<std::shared_ptr<Tensor>> tensors; 54 std::vector<std::shared_ptr<SparseTensor>> sparse_tensors; 55 std::vector<std::shared_ptr<Tensor>> ndarrays; 56 std::vector<std::shared_ptr<Buffer>> buffers; 57 ipc::IpcWriteOptions ipc_options; 58 59 SerializedPyObject(); 60 61 /// \brief Write serialized Python object to OutputStream 62 /// \param[in,out] dst an OutputStream 63 /// \return Status 64 Status WriteTo(io::OutputStream* dst); 65 66 /// \brief Convert SerializedPyObject to a dict containing the message 67 /// components as Buffer instances with minimal memory allocation 68 /// 69 /// { 70 /// 'num_tensors': M, 71 /// 'num_sparse_tensors': N, 72 /// 'num_buffers': K, 73 /// 'data': [Buffer] 74 /// } 75 /// 76 /// Each tensor is written as two buffers, one for the metadata and one for 77 /// the body. Therefore, the number of buffers in 'data' is 2 * M + 2 * N + K + 1, 78 /// with the first buffer containing the serialized record batch containing 79 /// the UnionArray that describes the whole object 80 Status GetComponents(MemoryPool* pool, PyObject** out); 81 }; 82 83 /// \brief Serialize Python sequence as a SerializedPyObject. 84 /// \param[in] context Serialization context which contains custom serialization 85 /// and deserialization callbacks. Can be any Python object with a 86 /// _serialize_callback method for serialization and a _deserialize_callback 87 /// method for deserialization. If context is None, no custom serialization 88 /// will be attempted. 89 /// \param[in] sequence A Python sequence object to serialize to Arrow data 90 /// structures 91 /// \param[out] out The serialized representation 92 /// \return Status 93 /// 94 /// Release GIL before calling 95 ARROW_PYTHON_EXPORT 96 Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); 97 98 /// \brief Serialize an Arrow Tensor as a SerializedPyObject. 99 /// \param[in] tensor Tensor to be serialized 100 /// \param[out] out The serialized representation 101 /// \return Status 102 ARROW_PYTHON_EXPORT 103 Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out); 104 105 /// \brief Write the Tensor metadata header to an OutputStream. 106 /// \param[in] dtype DataType of the Tensor 107 /// \param[in] shape The shape of the tensor 108 /// \param[in] tensor_num_bytes The length of the Tensor data in bytes 109 /// \param[in] dst The OutputStream to write the Tensor header to 110 /// \return Status 111 ARROW_PYTHON_EXPORT 112 Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype, 113 const std::vector<int64_t>& shape, int64_t tensor_num_bytes, 114 io::OutputStream* dst); 115 116 struct PythonType { 117 enum type { 118 BOOL, 119 INT, 120 PY2INT, // Kept for compatibility 121 BYTES, 122 STRING, 123 HALF_FLOAT, 124 FLOAT, 125 DOUBLE, 126 DATE64, 127 LIST, 128 DICT, 129 TUPLE, 130 SET, 131 TENSOR, 132 NDARRAY, 133 BUFFER, 134 SPARSECOOTENSOR, 135 SPARSECSRMATRIX, 136 SPARSECSCMATRIX, 137 SPARSECSFTENSOR, 138 NUM_PYTHON_TYPES 139 }; 140 }; 141 142 } // namespace py 143 144 } // namespace arrow 145