1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 #include "arrow/array/data.h"
19 
20 #include <algorithm>
21 #include <cstddef>
22 #include <cstdint>
23 #include <memory>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "arrow/buffer.h"
29 #include "arrow/status.h"
30 #include "arrow/type.h"
31 #include "arrow/util/bitmap_ops.h"
32 #include "arrow/util/int_util_internal.h"
33 #include "arrow/util/logging.h"
34 #include "arrow/util/macros.h"
35 
36 namespace arrow {
37 
38 using internal::CountSetBits;
39 
AdjustNonNullable(Type::type type_id,int64_t length,std::vector<std::shared_ptr<Buffer>> * buffers,int64_t * null_count)40 static inline void AdjustNonNullable(Type::type type_id, int64_t length,
41                                      std::vector<std::shared_ptr<Buffer>>* buffers,
42                                      int64_t* null_count) {
43   if (type_id == Type::NA) {
44     *null_count = length;
45     (*buffers)[0] = nullptr;
46   } else if (internal::HasValidityBitmap(type_id)) {
47     if (*null_count == 0) {
48       // In case there are no nulls, don't keep an allocated null bitmap around
49       (*buffers)[0] = nullptr;
50     } else if (*null_count == kUnknownNullCount && buffers->at(0) == nullptr) {
51       // Conversely, if no null bitmap is provided, set the null count to 0
52       *null_count = 0;
53     }
54   } else {
55     *null_count = 0;
56   }
57 }
58 
Make(std::shared_ptr<DataType> type,int64_t length,std::vector<std::shared_ptr<Buffer>> buffers,int64_t null_count,int64_t offset)59 std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
60                                            std::vector<std::shared_ptr<Buffer>> buffers,
61                                            int64_t null_count, int64_t offset) {
62   AdjustNonNullable(type->id(), length, &buffers, &null_count);
63   return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
64                                      null_count, offset);
65 }
66 
Make(std::shared_ptr<DataType> type,int64_t length,std::vector<std::shared_ptr<Buffer>> buffers,std::vector<std::shared_ptr<ArrayData>> child_data,int64_t null_count,int64_t offset)67 std::shared_ptr<ArrayData> ArrayData::Make(
68     std::shared_ptr<DataType> type, int64_t length,
69     std::vector<std::shared_ptr<Buffer>> buffers,
70     std::vector<std::shared_ptr<ArrayData>> child_data, int64_t null_count,
71     int64_t offset) {
72   AdjustNonNullable(type->id(), length, &buffers, &null_count);
73   return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
74                                      std::move(child_data), null_count, offset);
75 }
76 
Make(std::shared_ptr<DataType> type,int64_t length,std::vector<std::shared_ptr<Buffer>> buffers,std::vector<std::shared_ptr<ArrayData>> child_data,std::shared_ptr<ArrayData> dictionary,int64_t null_count,int64_t offset)77 std::shared_ptr<ArrayData> ArrayData::Make(
78     std::shared_ptr<DataType> type, int64_t length,
79     std::vector<std::shared_ptr<Buffer>> buffers,
80     std::vector<std::shared_ptr<ArrayData>> child_data,
81     std::shared_ptr<ArrayData> dictionary, int64_t null_count, int64_t offset) {
82   AdjustNonNullable(type->id(), length, &buffers, &null_count);
83   auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
84                                           std::move(child_data), null_count, offset);
85   data->dictionary = std::move(dictionary);
86   return data;
87 }
88 
Make(std::shared_ptr<DataType> type,int64_t length,int64_t null_count,int64_t offset)89 std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
90                                            int64_t null_count, int64_t offset) {
91   return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
92 }
93 
Slice(int64_t off,int64_t len) const94 std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
95   ARROW_CHECK_LE(off, length) << "Slice offset greater than array length";
96   len = std::min(length - off, len);
97   off += offset;
98 
99   auto copy = this->Copy();
100   copy->length = len;
101   copy->offset = off;
102   if (null_count == length) {
103     copy->null_count = len;
104   } else if (off == offset && len == length) {  // A copy of current.
105     copy->null_count = null_count.load();
106   } else {
107     copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
108   }
109   return copy;
110 }
111 
SliceSafe(int64_t off,int64_t len) const112 Result<std::shared_ptr<ArrayData>> ArrayData::SliceSafe(int64_t off, int64_t len) const {
113   RETURN_NOT_OK(internal::CheckSliceParams(length, off, len, "array"));
114   return Slice(off, len);
115 }
116 
GetNullCount() const117 int64_t ArrayData::GetNullCount() const {
118   int64_t precomputed = this->null_count.load();
119   if (ARROW_PREDICT_FALSE(precomputed == kUnknownNullCount)) {
120     if (this->buffers[0]) {
121       precomputed = this->length -
122                     CountSetBits(this->buffers[0]->data(), this->offset, this->length);
123     } else {
124       precomputed = 0;
125     }
126     this->null_count.store(precomputed);
127   }
128   return precomputed;
129 }
130 
131 // ----------------------------------------------------------------------
132 // Implement ArrayData::View
133 
134 namespace {
135 
AccumulateLayouts(const std::shared_ptr<DataType> & type,std::vector<DataTypeLayout> * layouts)136 void AccumulateLayouts(const std::shared_ptr<DataType>& type,
137                        std::vector<DataTypeLayout>* layouts) {
138   layouts->push_back(type->layout());
139   for (const auto& child : type->fields()) {
140     AccumulateLayouts(child->type(), layouts);
141   }
142 }
143 
AccumulateArrayData(const std::shared_ptr<ArrayData> & data,std::vector<std::shared_ptr<ArrayData>> * out)144 void AccumulateArrayData(const std::shared_ptr<ArrayData>& data,
145                          std::vector<std::shared_ptr<ArrayData>>* out) {
146   out->push_back(data);
147   for (const auto& child : data->child_data) {
148     AccumulateArrayData(child, out);
149   }
150 }
151 
152 struct ViewDataImpl {
153   std::shared_ptr<DataType> root_in_type;
154   std::shared_ptr<DataType> root_out_type;
155   std::vector<DataTypeLayout> in_layouts;
156   std::vector<std::shared_ptr<ArrayData>> in_data;
157   int64_t in_data_length;
158   size_t in_layout_idx = 0;
159   size_t in_buffer_idx = 0;
160   bool input_exhausted = false;
161 
InvalidViewarrow::__anon85748ac30111::ViewDataImpl162   Status InvalidView(const std::string& msg) {
163     return Status::Invalid("Can't view array of type ", root_in_type->ToString(), " as ",
164                            root_out_type->ToString(), ": ", msg);
165   }
166 
AdjustInputPointerarrow::__anon85748ac30111::ViewDataImpl167   void AdjustInputPointer() {
168     if (input_exhausted) {
169       return;
170     }
171     while (true) {
172       // Skip exhausted layout (might be empty layout)
173       while (in_buffer_idx >= in_layouts[in_layout_idx].buffers.size()) {
174         in_buffer_idx = 0;
175         ++in_layout_idx;
176         if (in_layout_idx >= in_layouts.size()) {
177           input_exhausted = true;
178           return;
179         }
180       }
181       const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx];
182       if (in_spec.kind != DataTypeLayout::ALWAYS_NULL) {
183         return;
184       }
185       // Skip always-null input buffers
186       // (e.g. buffer 0 of a null type or buffer 2 of a sparse union)
187       ++in_buffer_idx;
188     }
189   }
190 
CheckInputAvailablearrow::__anon85748ac30111::ViewDataImpl191   Status CheckInputAvailable() {
192     if (input_exhausted) {
193       return InvalidView("not enough buffers for view type");
194     }
195     return Status::OK();
196   }
197 
CheckInputExhaustedarrow::__anon85748ac30111::ViewDataImpl198   Status CheckInputExhausted() {
199     if (!input_exhausted) {
200       return InvalidView("too many buffers for view type");
201     }
202     return Status::OK();
203   }
204 
GetDictionaryViewarrow::__anon85748ac30111::ViewDataImpl205   Result<std::shared_ptr<ArrayData>> GetDictionaryView(const DataType& out_type) {
206     if (in_data[in_layout_idx]->type->id() != Type::DICTIONARY) {
207       return InvalidView("Cannot get view as dictionary type");
208     }
209     const auto& dict_out_type = static_cast<const DictionaryType&>(out_type);
210     return internal::GetArrayView(in_data[in_layout_idx]->dictionary,
211                                   dict_out_type.value_type());
212   }
213 
MakeDataViewarrow::__anon85748ac30111::ViewDataImpl214   Status MakeDataView(const std::shared_ptr<Field>& out_field,
215                       std::shared_ptr<ArrayData>* out) {
216     const auto& out_type = out_field->type();
217     const auto out_layout = out_type->layout();
218 
219     AdjustInputPointer();
220     int64_t out_length = in_data_length;
221     int64_t out_offset = 0;
222     int64_t out_null_count;
223 
224     std::shared_ptr<ArrayData> dictionary;
225     if (out_type->id() == Type::DICTIONARY) {
226       ARROW_ASSIGN_OR_RAISE(dictionary, GetDictionaryView(*out_type));
227     }
228 
229     // No type has a purely empty layout
230     DCHECK_GT(out_layout.buffers.size(), 0);
231 
232     std::vector<std::shared_ptr<Buffer>> out_buffers;
233 
234     // Process null bitmap
235     if (in_buffer_idx == 0 && out_layout.buffers[0].kind == DataTypeLayout::BITMAP) {
236       // Copy input null bitmap
237       RETURN_NOT_OK(CheckInputAvailable());
238       const auto& in_data_item = in_data[in_layout_idx];
239       if (!out_field->nullable() && in_data_item->GetNullCount() != 0) {
240         return InvalidView("nulls in input cannot be viewed as non-nullable");
241       }
242       DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx);
243       out_buffers.push_back(in_data_item->buffers[in_buffer_idx]);
244       out_length = in_data_item->length;
245       out_offset = in_data_item->offset;
246       out_null_count = in_data_item->null_count;
247       ++in_buffer_idx;
248       AdjustInputPointer();
249     } else {
250       // No null bitmap in input, append no-nulls bitmap
251       out_buffers.push_back(nullptr);
252       if (out_type->id() == Type::NA) {
253         out_null_count = out_length;
254       } else {
255         out_null_count = 0;
256       }
257     }
258 
259     // Process other buffers in output layout
260     for (size_t out_buffer_idx = 1; out_buffer_idx < out_layout.buffers.size();
261          ++out_buffer_idx) {
262       const auto& out_spec = out_layout.buffers[out_buffer_idx];
263       // If always-null buffer is expected, just construct it
264       if (out_spec.kind == DataTypeLayout::ALWAYS_NULL) {
265         out_buffers.push_back(nullptr);
266         continue;
267       }
268 
269       // If input buffer is null bitmap, try to ignore it
270       while (in_buffer_idx == 0) {
271         RETURN_NOT_OK(CheckInputAvailable());
272         if (in_data[in_layout_idx]->GetNullCount() != 0) {
273           return InvalidView("cannot represent nested nulls");
274         }
275         ++in_buffer_idx;
276         AdjustInputPointer();
277       }
278 
279       RETURN_NOT_OK(CheckInputAvailable());
280       const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx];
281       if (out_spec != in_spec) {
282         return InvalidView("incompatible layouts");
283       }
284       // Copy input buffer
285       const auto& in_data_item = in_data[in_layout_idx];
286       out_length = in_data_item->length;
287       out_offset = in_data_item->offset;
288       DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx);
289       out_buffers.push_back(in_data_item->buffers[in_buffer_idx]);
290       ++in_buffer_idx;
291       AdjustInputPointer();
292     }
293 
294     std::shared_ptr<ArrayData> out_data = ArrayData::Make(
295         out_type, out_length, std::move(out_buffers), out_null_count, out_offset);
296     out_data->dictionary = dictionary;
297 
298     // Process children recursively, depth-first
299     for (const auto& child_field : out_type->fields()) {
300       std::shared_ptr<ArrayData> child_data;
301       RETURN_NOT_OK(MakeDataView(child_field, &child_data));
302       out_data->child_data.push_back(std::move(child_data));
303     }
304     *out = std::move(out_data);
305     return Status::OK();
306   }
307 };
308 
309 }  // namespace
310 
311 namespace internal {
312 
GetArrayView(const std::shared_ptr<ArrayData> & data,const std::shared_ptr<DataType> & out_type)313 Result<std::shared_ptr<ArrayData>> GetArrayView(
314     const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& out_type) {
315   ViewDataImpl impl;
316   impl.root_in_type = data->type;
317   impl.root_out_type = out_type;
318   AccumulateLayouts(impl.root_in_type, &impl.in_layouts);
319   AccumulateArrayData(data, &impl.in_data);
320   impl.in_data_length = data->length;
321 
322   std::shared_ptr<ArrayData> out_data;
323   // Dummy field for output type
324   auto out_field = field("", out_type);
325   RETURN_NOT_OK(impl.MakeDataView(out_field, &out_data));
326   RETURN_NOT_OK(impl.CheckInputExhausted());
327   return out_data;
328 }
329 
330 }  // namespace internal
331 }  // namespace arrow
332