1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 // Array accessor classes for Binary, LargeBinart, String, LargeString, 19 // FixedSizeBinary 20 21 #pragma once 22 23 #include <cstdint> 24 #include <memory> 25 #include <string> 26 #include <vector> 27 28 #include "arrow/array/array_base.h" 29 #include "arrow/array/data.h" 30 #include "arrow/buffer.h" 31 #include "arrow/stl_iterator.h" 32 #include "arrow/type.h" 33 #include "arrow/util/checked_cast.h" 34 #include "arrow/util/macros.h" 35 #include "arrow/util/string_view.h" // IWYU pragma: export 36 #include "arrow/util/visibility.h" 37 38 namespace arrow { 39 40 /// \addtogroup binary-arrays 41 /// 42 /// @{ 43 44 // ---------------------------------------------------------------------- 45 // Binary and String 46 47 /// Base class for variable-sized binary arrays, regardless of offset size 48 /// and logical interpretation. 49 template <typename TYPE> 50 class BaseBinaryArray : public FlatArray { 51 public: 52 using TypeClass = TYPE; 53 using offset_type = typename TypeClass::offset_type; 54 using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>; 55 56 /// Return the pointer to the given elements bytes 57 // XXX should GetValue(int64_t i) return a string_view? GetValue(int64_t i,offset_type * out_length)58 const uint8_t* GetValue(int64_t i, offset_type* out_length) const { 59 // Account for base offset 60 i += data_->offset; 61 const offset_type pos = raw_value_offsets_[i]; 62 *out_length = raw_value_offsets_[i + 1] - pos; 63 return raw_data_ + pos; 64 } 65 66 /// \brief Get binary value as a string_view 67 /// 68 /// \param i the value index 69 /// \return the view over the selected value GetView(int64_t i)70 util::string_view GetView(int64_t i) const { 71 // Account for base offset 72 i += data_->offset; 73 const offset_type pos = raw_value_offsets_[i]; 74 return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos), 75 raw_value_offsets_[i + 1] - pos); 76 } 77 78 /// \brief Get binary value as a string_view 79 /// Provided for consistency with other arrays. 80 /// 81 /// \param i the value index 82 /// \return the view over the selected value Value(int64_t i)83 util::string_view Value(int64_t i) const { return GetView(i); } 84 85 /// \brief Get binary value as a std::string 86 /// 87 /// \param i the value index 88 /// \return the value copied into a std::string GetString(int64_t i)89 std::string GetString(int64_t i) const { return std::string(GetView(i)); } 90 91 /// Note that this buffer does not account for any slice offset value_offsets()92 std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; } 93 94 /// Note that this buffer does not account for any slice offset value_data()95 std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; } 96 raw_value_offsets()97 const offset_type* raw_value_offsets() const { 98 return raw_value_offsets_ + data_->offset; 99 } 100 raw_data()101 const uint8_t* raw_data() const { return raw_data_; } 102 103 /// \brief Return the data buffer absolute offset of the data for the value 104 /// at the passed index. 105 /// 106 /// Does not perform boundschecking value_offset(int64_t i)107 offset_type value_offset(int64_t i) const { 108 return raw_value_offsets_[i + data_->offset]; 109 } 110 111 /// \brief Return the length of the data for the value at the passed index. 112 /// 113 /// Does not perform boundschecking value_length(int64_t i)114 offset_type value_length(int64_t i) const { 115 i += data_->offset; 116 return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; 117 } 118 119 /// \brief Return the total length of the memory in the data buffer 120 /// referenced by this array. If the array has been sliced then this may be 121 /// less than the size of the data buffer (data_->buffers[2]). total_values_length()122 offset_type total_values_length() const { 123 if (data_->length > 0) { 124 return raw_value_offsets_[data_->length + data_->offset] - 125 raw_value_offsets_[data_->offset]; 126 } else { 127 return 0; 128 } 129 } 130 begin()131 IteratorType begin() const { return IteratorType(*this); } 132 end()133 IteratorType end() const { return IteratorType(*this, length()); } 134 135 protected: 136 // For subclasses 137 BaseBinaryArray() = default; 138 139 // Protected method for constructors SetData(const std::shared_ptr<ArrayData> & data)140 void SetData(const std::shared_ptr<ArrayData>& data) { 141 this->Array::SetData(data); 142 raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0); 143 raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0); 144 } 145 146 const offset_type* raw_value_offsets_ = NULLPTR; 147 const uint8_t* raw_data_ = NULLPTR; 148 }; 149 150 /// Concrete Array class for variable-size binary data 151 class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> { 152 public: 153 explicit BinaryArray(const std::shared_ptr<ArrayData>& data); 154 155 BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 156 const std::shared_ptr<Buffer>& data, 157 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 158 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 159 160 protected: 161 // For subclasses such as StringArray BinaryArray()162 BinaryArray() : BaseBinaryArray() {} 163 }; 164 165 /// Concrete Array class for variable-size string (utf-8) data 166 class ARROW_EXPORT StringArray : public BinaryArray { 167 public: 168 using TypeClass = StringType; 169 170 explicit StringArray(const std::shared_ptr<ArrayData>& data); 171 172 StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 173 const std::shared_ptr<Buffer>& data, 174 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 175 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 176 177 /// \brief Validate that this array contains only valid UTF8 entries 178 /// 179 /// This check is also implied by ValidateFull() 180 Status ValidateUTF8() const; 181 }; 182 183 /// Concrete Array class for large variable-size binary data 184 class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> { 185 public: 186 explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data); 187 188 LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 189 const std::shared_ptr<Buffer>& data, 190 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 191 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 192 193 protected: 194 // For subclasses such as LargeStringArray LargeBinaryArray()195 LargeBinaryArray() : BaseBinaryArray() {} 196 }; 197 198 /// Concrete Array class for large variable-size string (utf-8) data 199 class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { 200 public: 201 using TypeClass = LargeStringType; 202 203 explicit LargeStringArray(const std::shared_ptr<ArrayData>& data); 204 205 LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets, 206 const std::shared_ptr<Buffer>& data, 207 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 208 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 209 210 /// \brief Validate that this array contains only valid UTF8 entries 211 /// 212 /// This check is also implied by ValidateFull() 213 Status ValidateUTF8() const; 214 }; 215 216 // ---------------------------------------------------------------------- 217 // Fixed width binary 218 219 /// Concrete Array class for fixed-size binary data 220 class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { 221 public: 222 using TypeClass = FixedSizeBinaryType; 223 using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>; 224 225 explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data); 226 227 FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length, 228 const std::shared_ptr<Buffer>& data, 229 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, 230 int64_t null_count = kUnknownNullCount, int64_t offset = 0); 231 232 const uint8_t* GetValue(int64_t i) const; Value(int64_t i)233 const uint8_t* Value(int64_t i) const { return GetValue(i); } 234 GetView(int64_t i)235 util::string_view GetView(int64_t i) const { 236 return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width()); 237 } 238 GetString(int64_t i)239 std::string GetString(int64_t i) const { return std::string(GetView(i)); } 240 byte_width()241 int32_t byte_width() const { return byte_width_; } 242 raw_values()243 const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } 244 begin()245 IteratorType begin() const { return IteratorType(*this); } 246 end()247 IteratorType end() const { return IteratorType(*this, length()); } 248 249 protected: SetData(const std::shared_ptr<ArrayData> & data)250 void SetData(const std::shared_ptr<ArrayData>& data) { 251 this->PrimitiveArray::SetData(data); 252 byte_width_ = 253 internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width(); 254 } 255 256 int32_t byte_width_; 257 }; 258 259 /// @} 260 261 } // namespace arrow 262