1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #pragma once
19 
20 #include <memory>
21 #include <vector>
22 
23 #include "arrow/ipc/options.h"
24 #include "arrow/python/visibility.h"
25 #include "arrow/sparse_tensor.h"
26 #include "arrow/status.h"
27 
// PyObject is forward-declared here rather than obtained from the Python
// headers; for background on this idiom see
// https://mail.python.org/pipermail/python-dev/2003-August/037601.html
// The declaration is skipped whenever PyObject_HEAD is already defined.
#ifndef PyObject_HEAD
struct _object;
using PyObject = _object;
#endif
34 
35 namespace arrow {
36 
// Forward declarations. This header only refers to these types through
// pointers and std::shared_ptr, so their full definitions are not needed here.
class Buffer;
class DataType;
class MemoryPool;
class RecordBatch;
class Tensor;

namespace io {
class OutputStream;
}  // namespace io
48 
49 namespace py {
50 
51 struct ARROW_PYTHON_EXPORT SerializedPyObject {
52   std::shared_ptr<RecordBatch> batch;
53   std::vector<std::shared_ptr<Tensor>> tensors;
54   std::vector<std::shared_ptr<SparseTensor>> sparse_tensors;
55   std::vector<std::shared_ptr<Tensor>> ndarrays;
56   std::vector<std::shared_ptr<Buffer>> buffers;
57   ipc::IpcWriteOptions ipc_options;
58 
59   SerializedPyObject();
60 
61   /// \brief Write serialized Python object to OutputStream
62   /// \param[in,out] dst an OutputStream
63   /// \return Status
64   Status WriteTo(io::OutputStream* dst);
65 
66   /// \brief Convert SerializedPyObject to a dict containing the message
67   /// components as Buffer instances with minimal memory allocation
68   ///
69   /// {
70   ///   'num_tensors': M,
71   ///   'num_sparse_tensors': N,
72   ///   'num_buffers': K,
73   ///   'data': [Buffer]
74   /// }
75   ///
76   /// Each tensor is written as two buffers, one for the metadata and one for
77   /// the body. Therefore, the number of buffers in 'data' is 2 * M + 2 * N + K + 1,
78   /// with the first buffer containing the serialized record batch containing
79   /// the UnionArray that describes the whole object
80   Status GetComponents(MemoryPool* pool, PyObject** out);
81 };
82 
83 /// \brief Serialize Python sequence as a SerializedPyObject.
84 /// \param[in] context Serialization context which contains custom serialization
85 /// and deserialization callbacks. Can be any Python object with a
86 /// _serialize_callback method for serialization and a _deserialize_callback
87 /// method for deserialization. If context is None, no custom serialization
88 /// will be attempted.
89 /// \param[in] sequence A Python sequence object to serialize to Arrow data
90 /// structures
91 /// \param[out] out The serialized representation
92 /// \return Status
93 ///
94 /// Release GIL before calling
95 ARROW_PYTHON_EXPORT
96 Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out);
97 
98 /// \brief Serialize an Arrow Tensor as a SerializedPyObject.
99 /// \param[in] tensor Tensor to be serialized
100 /// \param[out] out The serialized representation
101 /// \return Status
102 ARROW_PYTHON_EXPORT
103 Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out);
104 
105 /// \brief Write the Tensor metadata header to an OutputStream.
106 /// \param[in] dtype DataType of the Tensor
107 /// \param[in] shape The shape of the tensor
108 /// \param[in] tensor_num_bytes The length of the Tensor data in bytes
109 /// \param[in] dst The OutputStream to write the Tensor header to
110 /// \return Status
111 ARROW_PYTHON_EXPORT
112 Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,
113                           const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
114                           io::OutputStream* dst);
115 
/// \brief Scope for the enumeration of Python value kinds.
///
/// The struct only provides a namespace for the unscoped enum, so that the
/// enumerators are spelled PythonType::BOOL and the enum type itself is
/// PythonType::type.
///
/// The numeric values are written out explicitly so that inserting or
/// reordering an enumerator cannot silently renumber the others.
/// NOTE(review): presumably these values end up in serialized data (PY2INT is
/// "kept for compatibility") -- confirm before ever changing a number.
struct PythonType {
  enum type {
    BOOL = 0,
    INT = 1,
    PY2INT = 2,  // Kept for compatibility
    BYTES = 3,
    STRING = 4,
    HALF_FLOAT = 5,
    FLOAT = 6,
    DOUBLE = 7,
    DATE64 = 8,
    LIST = 9,
    DICT = 10,
    TUPLE = 11,
    SET = 12,
    TENSOR = 13,
    NDARRAY = 14,
    BUFFER = 15,
    SPARSECOOTENSOR = 16,
    SPARSECSRMATRIX = 17,
    SPARSECSCMATRIX = 18,
    SPARSECSFTENSOR = 19,
    // Sentinel: must stay last; its value is the number of kinds above.
    NUM_PYTHON_TYPES
  };
};
141 
142 }  // namespace py
143 
144 }  // namespace arrow
145