1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
// Array accessor classes for Binary, LargeBinary, String, LargeString,
19 // FixedSizeBinary
20 
21 #pragma once
22 
23 #include <cstdint>
24 #include <memory>
25 #include <string>
26 #include <vector>
27 
28 #include "arrow/array/array_base.h"
29 #include "arrow/array/data.h"
30 #include "arrow/buffer.h"
31 #include "arrow/stl_iterator.h"
32 #include "arrow/type.h"
33 #include "arrow/util/checked_cast.h"
34 #include "arrow/util/macros.h"
35 #include "arrow/util/string_view.h"  // IWYU pragma: export
36 #include "arrow/util/visibility.h"
37 
38 namespace arrow {
39 
40 /// \addtogroup binary-arrays
41 ///
42 /// @{
43 
44 // ----------------------------------------------------------------------
45 // Binary and String
46 
47 /// Base class for variable-sized binary arrays, regardless of offset size
48 /// and logical interpretation.
49 template <typename TYPE>
50 class BaseBinaryArray : public FlatArray {
51  public:
52   using TypeClass = TYPE;
53   using offset_type = typename TypeClass::offset_type;
54   using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
55 
56   /// Return the pointer to the given elements bytes
57   // XXX should GetValue(int64_t i) return a string_view?
GetValue(int64_t i,offset_type * out_length)58   const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
59     // Account for base offset
60     i += data_->offset;
61     const offset_type pos = raw_value_offsets_[i];
62     *out_length = raw_value_offsets_[i + 1] - pos;
63     return raw_data_ + pos;
64   }
65 
66   /// \brief Get binary value as a string_view
67   ///
68   /// \param i the value index
69   /// \return the view over the selected value
GetView(int64_t i)70   util::string_view GetView(int64_t i) const {
71     // Account for base offset
72     i += data_->offset;
73     const offset_type pos = raw_value_offsets_[i];
74     return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
75                              raw_value_offsets_[i + 1] - pos);
76   }
77 
78   /// \brief Get binary value as a string_view
79   /// Provided for consistency with other arrays.
80   ///
81   /// \param i the value index
82   /// \return the view over the selected value
Value(int64_t i)83   util::string_view Value(int64_t i) const { return GetView(i); }
84 
85   /// \brief Get binary value as a std::string
86   ///
87   /// \param i the value index
88   /// \return the value copied into a std::string
GetString(int64_t i)89   std::string GetString(int64_t i) const { return std::string(GetView(i)); }
90 
91   /// Note that this buffer does not account for any slice offset
value_offsets()92   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
93 
94   /// Note that this buffer does not account for any slice offset
value_data()95   std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
96 
raw_value_offsets()97   const offset_type* raw_value_offsets() const {
98     return raw_value_offsets_ + data_->offset;
99   }
100 
raw_data()101   const uint8_t* raw_data() const { return raw_data_; }
102 
103   /// \brief Return the data buffer absolute offset of the data for the value
104   /// at the passed index.
105   ///
106   /// Does not perform boundschecking
value_offset(int64_t i)107   offset_type value_offset(int64_t i) const {
108     return raw_value_offsets_[i + data_->offset];
109   }
110 
111   /// \brief Return the length of the data for the value at the passed index.
112   ///
113   /// Does not perform boundschecking
value_length(int64_t i)114   offset_type value_length(int64_t i) const {
115     i += data_->offset;
116     return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
117   }
118 
119   /// \brief Return the total length of the memory in the data buffer
120   /// referenced by this array. If the array has been sliced then this may be
121   /// less than the size of the data buffer (data_->buffers[2]).
total_values_length()122   offset_type total_values_length() const {
123     if (data_->length > 0) {
124       return raw_value_offsets_[data_->length + data_->offset] -
125              raw_value_offsets_[data_->offset];
126     } else {
127       return 0;
128     }
129   }
130 
begin()131   IteratorType begin() const { return IteratorType(*this); }
132 
end()133   IteratorType end() const { return IteratorType(*this, length()); }
134 
135  protected:
136   // For subclasses
137   BaseBinaryArray() = default;
138 
139   // Protected method for constructors
SetData(const std::shared_ptr<ArrayData> & data)140   void SetData(const std::shared_ptr<ArrayData>& data) {
141     this->Array::SetData(data);
142     raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
143     raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
144   }
145 
146   const offset_type* raw_value_offsets_ = NULLPTR;
147   const uint8_t* raw_data_ = NULLPTR;
148 };
149 
150 /// Concrete Array class for variable-size binary data
151 class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
152  public:
153   explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
154 
155   BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
156               const std::shared_ptr<Buffer>& data,
157               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
158               int64_t null_count = kUnknownNullCount, int64_t offset = 0);
159 
160  protected:
161   // For subclasses such as StringArray
BinaryArray()162   BinaryArray() : BaseBinaryArray() {}
163 };
164 
165 /// Concrete Array class for variable-size string (utf-8) data
166 class ARROW_EXPORT StringArray : public BinaryArray {
167  public:
168   using TypeClass = StringType;
169 
170   explicit StringArray(const std::shared_ptr<ArrayData>& data);
171 
172   StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
173               const std::shared_ptr<Buffer>& data,
174               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
175               int64_t null_count = kUnknownNullCount, int64_t offset = 0);
176 
177   /// \brief Validate that this array contains only valid UTF8 entries
178   ///
179   /// This check is also implied by ValidateFull()
180   Status ValidateUTF8() const;
181 };
182 
183 /// Concrete Array class for large variable-size binary data
184 class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
185  public:
186   explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
187 
188   LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
189                    const std::shared_ptr<Buffer>& data,
190                    const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
191                    int64_t null_count = kUnknownNullCount, int64_t offset = 0);
192 
193  protected:
194   // For subclasses such as LargeStringArray
LargeBinaryArray()195   LargeBinaryArray() : BaseBinaryArray() {}
196 };
197 
198 /// Concrete Array class for large variable-size string (utf-8) data
199 class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
200  public:
201   using TypeClass = LargeStringType;
202 
203   explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
204 
205   LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
206                    const std::shared_ptr<Buffer>& data,
207                    const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
208                    int64_t null_count = kUnknownNullCount, int64_t offset = 0);
209 
210   /// \brief Validate that this array contains only valid UTF8 entries
211   ///
212   /// This check is also implied by ValidateFull()
213   Status ValidateUTF8() const;
214 };
215 
216 // ----------------------------------------------------------------------
217 // Fixed width binary
218 
219 /// Concrete Array class for fixed-size binary data
220 class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
221  public:
222   using TypeClass = FixedSizeBinaryType;
223   using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
224 
225   explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
226 
227   FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
228                        const std::shared_ptr<Buffer>& data,
229                        const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
230                        int64_t null_count = kUnknownNullCount, int64_t offset = 0);
231 
232   const uint8_t* GetValue(int64_t i) const;
Value(int64_t i)233   const uint8_t* Value(int64_t i) const { return GetValue(i); }
234 
GetView(int64_t i)235   util::string_view GetView(int64_t i) const {
236     return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
237   }
238 
GetString(int64_t i)239   std::string GetString(int64_t i) const { return std::string(GetView(i)); }
240 
byte_width()241   int32_t byte_width() const { return byte_width_; }
242 
raw_values()243   const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
244 
begin()245   IteratorType begin() const { return IteratorType(*this); }
246 
end()247   IteratorType end() const { return IteratorType(*this, length()); }
248 
249  protected:
SetData(const std::shared_ptr<ArrayData> & data)250   void SetData(const std::shared_ptr<ArrayData>& data) {
251     this->PrimitiveArray::SetData(data);
252     byte_width_ =
253         internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
254   }
255 
256   int32_t byte_width_;
257 };
258 
259 /// @}
260 
261 }  // namespace arrow
262