1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 #include "arrow/array/data.h"
19
20 #include <algorithm>
21 #include <cstddef>
22 #include <cstdint>
23 #include <memory>
24 #include <string>
25 #include <utility>
26 #include <vector>
27
28 #include "arrow/buffer.h"
29 #include "arrow/status.h"
30 #include "arrow/type.h"
31 #include "arrow/util/bitmap_ops.h"
32 #include "arrow/util/int_util_internal.h"
33 #include "arrow/util/logging.h"
34 #include "arrow/util/macros.h"
35
36 namespace arrow {
37
38 using internal::CountSetBits;
39
AdjustNonNullable(Type::type type_id,int64_t length,std::vector<std::shared_ptr<Buffer>> * buffers,int64_t * null_count)40 static inline void AdjustNonNullable(Type::type type_id, int64_t length,
41 std::vector<std::shared_ptr<Buffer>>* buffers,
42 int64_t* null_count) {
43 if (type_id == Type::NA) {
44 *null_count = length;
45 (*buffers)[0] = nullptr;
46 } else if (internal::HasValidityBitmap(type_id)) {
47 if (*null_count == 0) {
48 // In case there are no nulls, don't keep an allocated null bitmap around
49 (*buffers)[0] = nullptr;
50 } else if (*null_count == kUnknownNullCount && buffers->at(0) == nullptr) {
51 // Conversely, if no null bitmap is provided, set the null count to 0
52 *null_count = 0;
53 }
54 } else {
55 *null_count = 0;
56 }
57 }
58
Make(std::shared_ptr<DataType> type,int64_t length,std::vector<std::shared_ptr<Buffer>> buffers,int64_t null_count,int64_t offset)59 std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
60 std::vector<std::shared_ptr<Buffer>> buffers,
61 int64_t null_count, int64_t offset) {
62 AdjustNonNullable(type->id(), length, &buffers, &null_count);
63 return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
64 null_count, offset);
65 }
66
Make(std::shared_ptr<DataType> type,int64_t length,std::vector<std::shared_ptr<Buffer>> buffers,std::vector<std::shared_ptr<ArrayData>> child_data,int64_t null_count,int64_t offset)67 std::shared_ptr<ArrayData> ArrayData::Make(
68 std::shared_ptr<DataType> type, int64_t length,
69 std::vector<std::shared_ptr<Buffer>> buffers,
70 std::vector<std::shared_ptr<ArrayData>> child_data, int64_t null_count,
71 int64_t offset) {
72 AdjustNonNullable(type->id(), length, &buffers, &null_count);
73 return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
74 std::move(child_data), null_count, offset);
75 }
76
Make(std::shared_ptr<DataType> type,int64_t length,std::vector<std::shared_ptr<Buffer>> buffers,std::vector<std::shared_ptr<ArrayData>> child_data,std::shared_ptr<ArrayData> dictionary,int64_t null_count,int64_t offset)77 std::shared_ptr<ArrayData> ArrayData::Make(
78 std::shared_ptr<DataType> type, int64_t length,
79 std::vector<std::shared_ptr<Buffer>> buffers,
80 std::vector<std::shared_ptr<ArrayData>> child_data,
81 std::shared_ptr<ArrayData> dictionary, int64_t null_count, int64_t offset) {
82 AdjustNonNullable(type->id(), length, &buffers, &null_count);
83 auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
84 std::move(child_data), null_count, offset);
85 data->dictionary = std::move(dictionary);
86 return data;
87 }
88
Make(std::shared_ptr<DataType> type,int64_t length,int64_t null_count,int64_t offset)89 std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
90 int64_t null_count, int64_t offset) {
91 return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
92 }
93
Slice(int64_t off,int64_t len) const94 std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
95 ARROW_CHECK_LE(off, length) << "Slice offset greater than array length";
96 len = std::min(length - off, len);
97 off += offset;
98
99 auto copy = this->Copy();
100 copy->length = len;
101 copy->offset = off;
102 if (null_count == length) {
103 copy->null_count = len;
104 } else if (off == offset && len == length) { // A copy of current.
105 copy->null_count = null_count.load();
106 } else {
107 copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
108 }
109 return copy;
110 }
111
SliceSafe(int64_t off,int64_t len) const112 Result<std::shared_ptr<ArrayData>> ArrayData::SliceSafe(int64_t off, int64_t len) const {
113 RETURN_NOT_OK(internal::CheckSliceParams(length, off, len, "array"));
114 return Slice(off, len);
115 }
116
GetNullCount() const117 int64_t ArrayData::GetNullCount() const {
118 int64_t precomputed = this->null_count.load();
119 if (ARROW_PREDICT_FALSE(precomputed == kUnknownNullCount)) {
120 if (this->buffers[0]) {
121 precomputed = this->length -
122 CountSetBits(this->buffers[0]->data(), this->offset, this->length);
123 } else {
124 precomputed = 0;
125 }
126 this->null_count.store(precomputed);
127 }
128 return precomputed;
129 }
130
131 // ----------------------------------------------------------------------
132 // Implement ArrayData::View
133
134 namespace {
135
AccumulateLayouts(const std::shared_ptr<DataType> & type,std::vector<DataTypeLayout> * layouts)136 void AccumulateLayouts(const std::shared_ptr<DataType>& type,
137 std::vector<DataTypeLayout>* layouts) {
138 layouts->push_back(type->layout());
139 for (const auto& child : type->fields()) {
140 AccumulateLayouts(child->type(), layouts);
141 }
142 }
143
AccumulateArrayData(const std::shared_ptr<ArrayData> & data,std::vector<std::shared_ptr<ArrayData>> * out)144 void AccumulateArrayData(const std::shared_ptr<ArrayData>& data,
145 std::vector<std::shared_ptr<ArrayData>>* out) {
146 out->push_back(data);
147 for (const auto& child : data->child_data) {
148 AccumulateArrayData(child, out);
149 }
150 }
151
152 struct ViewDataImpl {
153 std::shared_ptr<DataType> root_in_type;
154 std::shared_ptr<DataType> root_out_type;
155 std::vector<DataTypeLayout> in_layouts;
156 std::vector<std::shared_ptr<ArrayData>> in_data;
157 int64_t in_data_length;
158 size_t in_layout_idx = 0;
159 size_t in_buffer_idx = 0;
160 bool input_exhausted = false;
161
InvalidViewarrow::__anon85748ac30111::ViewDataImpl162 Status InvalidView(const std::string& msg) {
163 return Status::Invalid("Can't view array of type ", root_in_type->ToString(), " as ",
164 root_out_type->ToString(), ": ", msg);
165 }
166
AdjustInputPointerarrow::__anon85748ac30111::ViewDataImpl167 void AdjustInputPointer() {
168 if (input_exhausted) {
169 return;
170 }
171 while (true) {
172 // Skip exhausted layout (might be empty layout)
173 while (in_buffer_idx >= in_layouts[in_layout_idx].buffers.size()) {
174 in_buffer_idx = 0;
175 ++in_layout_idx;
176 if (in_layout_idx >= in_layouts.size()) {
177 input_exhausted = true;
178 return;
179 }
180 }
181 const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx];
182 if (in_spec.kind != DataTypeLayout::ALWAYS_NULL) {
183 return;
184 }
185 // Skip always-null input buffers
186 // (e.g. buffer 0 of a null type or buffer 2 of a sparse union)
187 ++in_buffer_idx;
188 }
189 }
190
CheckInputAvailablearrow::__anon85748ac30111::ViewDataImpl191 Status CheckInputAvailable() {
192 if (input_exhausted) {
193 return InvalidView("not enough buffers for view type");
194 }
195 return Status::OK();
196 }
197
CheckInputExhaustedarrow::__anon85748ac30111::ViewDataImpl198 Status CheckInputExhausted() {
199 if (!input_exhausted) {
200 return InvalidView("too many buffers for view type");
201 }
202 return Status::OK();
203 }
204
GetDictionaryViewarrow::__anon85748ac30111::ViewDataImpl205 Result<std::shared_ptr<ArrayData>> GetDictionaryView(const DataType& out_type) {
206 if (in_data[in_layout_idx]->type->id() != Type::DICTIONARY) {
207 return InvalidView("Cannot get view as dictionary type");
208 }
209 const auto& dict_out_type = static_cast<const DictionaryType&>(out_type);
210 return internal::GetArrayView(in_data[in_layout_idx]->dictionary,
211 dict_out_type.value_type());
212 }
213
MakeDataViewarrow::__anon85748ac30111::ViewDataImpl214 Status MakeDataView(const std::shared_ptr<Field>& out_field,
215 std::shared_ptr<ArrayData>* out) {
216 const auto& out_type = out_field->type();
217 const auto out_layout = out_type->layout();
218
219 AdjustInputPointer();
220 int64_t out_length = in_data_length;
221 int64_t out_offset = 0;
222 int64_t out_null_count;
223
224 std::shared_ptr<ArrayData> dictionary;
225 if (out_type->id() == Type::DICTIONARY) {
226 ARROW_ASSIGN_OR_RAISE(dictionary, GetDictionaryView(*out_type));
227 }
228
229 // No type has a purely empty layout
230 DCHECK_GT(out_layout.buffers.size(), 0);
231
232 std::vector<std::shared_ptr<Buffer>> out_buffers;
233
234 // Process null bitmap
235 if (in_buffer_idx == 0 && out_layout.buffers[0].kind == DataTypeLayout::BITMAP) {
236 // Copy input null bitmap
237 RETURN_NOT_OK(CheckInputAvailable());
238 const auto& in_data_item = in_data[in_layout_idx];
239 if (!out_field->nullable() && in_data_item->GetNullCount() != 0) {
240 return InvalidView("nulls in input cannot be viewed as non-nullable");
241 }
242 DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx);
243 out_buffers.push_back(in_data_item->buffers[in_buffer_idx]);
244 out_length = in_data_item->length;
245 out_offset = in_data_item->offset;
246 out_null_count = in_data_item->null_count;
247 ++in_buffer_idx;
248 AdjustInputPointer();
249 } else {
250 // No null bitmap in input, append no-nulls bitmap
251 out_buffers.push_back(nullptr);
252 if (out_type->id() == Type::NA) {
253 out_null_count = out_length;
254 } else {
255 out_null_count = 0;
256 }
257 }
258
259 // Process other buffers in output layout
260 for (size_t out_buffer_idx = 1; out_buffer_idx < out_layout.buffers.size();
261 ++out_buffer_idx) {
262 const auto& out_spec = out_layout.buffers[out_buffer_idx];
263 // If always-null buffer is expected, just construct it
264 if (out_spec.kind == DataTypeLayout::ALWAYS_NULL) {
265 out_buffers.push_back(nullptr);
266 continue;
267 }
268
269 // If input buffer is null bitmap, try to ignore it
270 while (in_buffer_idx == 0) {
271 RETURN_NOT_OK(CheckInputAvailable());
272 if (in_data[in_layout_idx]->GetNullCount() != 0) {
273 return InvalidView("cannot represent nested nulls");
274 }
275 ++in_buffer_idx;
276 AdjustInputPointer();
277 }
278
279 RETURN_NOT_OK(CheckInputAvailable());
280 const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx];
281 if (out_spec != in_spec) {
282 return InvalidView("incompatible layouts");
283 }
284 // Copy input buffer
285 const auto& in_data_item = in_data[in_layout_idx];
286 out_length = in_data_item->length;
287 out_offset = in_data_item->offset;
288 DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx);
289 out_buffers.push_back(in_data_item->buffers[in_buffer_idx]);
290 ++in_buffer_idx;
291 AdjustInputPointer();
292 }
293
294 std::shared_ptr<ArrayData> out_data = ArrayData::Make(
295 out_type, out_length, std::move(out_buffers), out_null_count, out_offset);
296 out_data->dictionary = dictionary;
297
298 // Process children recursively, depth-first
299 for (const auto& child_field : out_type->fields()) {
300 std::shared_ptr<ArrayData> child_data;
301 RETURN_NOT_OK(MakeDataView(child_field, &child_data));
302 out_data->child_data.push_back(std::move(child_data));
303 }
304 *out = std::move(out_data);
305 return Status::OK();
306 }
307 };
308
309 } // namespace
310
311 namespace internal {
312
GetArrayView(const std::shared_ptr<ArrayData> & data,const std::shared_ptr<DataType> & out_type)313 Result<std::shared_ptr<ArrayData>> GetArrayView(
314 const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& out_type) {
315 ViewDataImpl impl;
316 impl.root_in_type = data->type;
317 impl.root_out_type = out_type;
318 AccumulateLayouts(impl.root_in_type, &impl.in_layouts);
319 AccumulateArrayData(data, &impl.in_data);
320 impl.in_data_length = data->length;
321
322 std::shared_ptr<ArrayData> out_data;
323 // Dummy field for output type
324 auto out_field = field("", out_type);
325 RETURN_NOT_OK(impl.MakeDataView(out_field, &out_data));
326 RETURN_NOT_OK(impl.CheckInputExhausted());
327 return out_data;
328 }
329
330 } // namespace internal
331 } // namespace arrow
332