1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied. See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17
18 //! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
19 //!
20 //! Generally, this module is divided in two main interfaces:
21 //! One interface maps C ABI to native Rust types, i.e. convert c-pointers, c_char, to native rust.
22 //! This is handled by [FFI_ArrowSchema] and [FFI_ArrowArray].
23 //!
//! The second interface maps native Rust types to the Rust-specific implementation of Arrow, such as `format` to `DataType`,
25 //! `Buffer`, etc. This is handled by `ArrowArray`.
26 //!
27 //! ```rust
28 //! # use std::sync::Arc;
29 //! # use arrow::array::{Int32Array, Array, ArrayData, make_array_from_raw};
30 //! # use arrow::error::{Result, ArrowError};
31 //! # use arrow::compute::kernels::arithmetic;
32 //! # use std::convert::TryFrom;
33 //! # fn main() -> Result<()> {
34 //! // create an array natively
35 //! let array = Int32Array::from(vec![Some(1), None, Some(3)]);
36 //!
37 //! // export it
38 //! let (array_ptr, schema_ptr) = array.to_raw()?;
39 //!
40 //! // consumed and used by something else...
41 //!
42 //! // import it
43 //! let array = unsafe { make_array_from_raw(array_ptr, schema_ptr)? };
44 //!
45 //! // perform some operation
46 //! let array = array.as_any().downcast_ref::<Int32Array>().ok_or(
47 //! ArrowError::ParseError("Expects an int32".to_string()),
48 //! )?;
49 //! let array = arithmetic::add(&array, &array)?;
50 //!
51 //! // verify
52 //! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)]));
53 //!
54 //! // (drop/release)
55 //! Ok(())
56 //! }
57 //! ```
58
59 /*
60 # Design:
61
62 Main assumptions:
* A memory region is deallocated according to its own release mechanism.
64 * Rust shares memory regions between arrays.
65 * A memory region should be deallocated when no-one is using it.
66
67 The design of this module is as follows:
68
69 `ArrowArray` contains two `Arc`s, one per ABI-compatible `struct`, each containing data
70 according to the C Data Interface. These Arcs are used for ref counting of the structs
71 within Rust and lifetime management.
72
Each ABI-compatible `struct` knows how to `drop` itself, calling `release`.
74
75 To import an array, unsafely create an `ArrowArray` from two pointers using [ArrowArray::try_from_raw].
76 To export an array, create an `ArrowArray` using [ArrowArray::try_new].
77 */
78
79 use std::{
80 ffi::CStr,
81 ffi::CString,
82 iter,
83 mem::size_of,
84 ptr::{self, NonNull},
85 sync::Arc,
86 };
87
88 use crate::buffer::Buffer;
89 use crate::datatypes::{DataType, TimeUnit};
90 use crate::error::{ArrowError, Result};
91 use crate::util::bit_util;
92
/// ABI-compatible mirror of the `ArrowSchema` struct of the C Data Interface.
///
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>.
/// The definition was originally produced by bindgen. Because of `#[repr(C)]`
/// the fields are laid out in declaration order, so neither their order nor
/// their types may change.
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowSchema {
    // C string holding the data type's format; read back by `format()`.
    format: *const std::os::raw::c_char,
    name: *const std::os::raw::c_char,
    metadata: *const std::os::raw::c_char,
    flags: i64,
    n_children: i64,
    children: *mut *mut Self,
    dictionary: *mut Self,
    // Callback that frees what the schema owns; `None` marks a released (or empty) schema.
    release: Option<unsafe extern "C" fn(arg1: *mut Self)>,
    private_data: *mut std::os::raw::c_void,
}
109
110 // callback used to drop [FFI_ArrowSchema] when it is exported.
release_schema(schema: *mut FFI_ArrowSchema)111 unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
112 let schema = &mut *schema;
113
114 // take ownership back to release it.
115 CString::from_raw(schema.format as *mut std::os::raw::c_char);
116
117 schema.release = None;
118 }
119
120 impl FFI_ArrowSchema {
121 /// create a new [FFI_ArrowSchema] from a format.
new(format: &str) -> FFI_ArrowSchema122 fn new(format: &str) -> FFI_ArrowSchema {
123 // <https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema>
124 FFI_ArrowSchema {
125 format: CString::new(format).unwrap().into_raw(),
126 name: std::ptr::null_mut(),
127 metadata: std::ptr::null_mut(),
128 flags: 0,
129 n_children: 0,
130 children: ptr::null_mut(),
131 dictionary: std::ptr::null_mut(),
132 release: Some(release_schema),
133 private_data: std::ptr::null_mut(),
134 }
135 }
136
137 /// create an empty [FFI_ArrowSchema]
empty() -> Self138 fn empty() -> Self {
139 Self {
140 format: std::ptr::null_mut(),
141 name: std::ptr::null_mut(),
142 metadata: std::ptr::null_mut(),
143 flags: 0,
144 n_children: 0,
145 children: ptr::null_mut(),
146 dictionary: std::ptr::null_mut(),
147 release: None,
148 private_data: std::ptr::null_mut(),
149 }
150 }
151
152 /// returns the format of this schema.
format(&self) -> &str153 pub fn format(&self) -> &str {
154 unsafe { CStr::from_ptr(self.format) }
155 .to_str()
156 .expect("The external API has a non-utf8 as format")
157 }
158 }
159
160 impl Drop for FFI_ArrowSchema {
drop(&mut self)161 fn drop(&mut self) {
162 match self.release {
163 None => (),
164 Some(release) => unsafe { release(self) },
165 };
166 }
167 }
168
169 /// maps a DataType `format` to a [DataType](arrow::datatypes::DataType).
170 /// See https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
to_datatype(format: &str) -> Result<DataType>171 fn to_datatype(format: &str) -> Result<DataType> {
172 Ok(match format {
173 "n" => DataType::Null,
174 "b" => DataType::Boolean,
175 "c" => DataType::Int8,
176 "C" => DataType::UInt8,
177 "s" => DataType::Int16,
178 "S" => DataType::UInt16,
179 "i" => DataType::Int32,
180 "I" => DataType::UInt32,
181 "l" => DataType::Int64,
182 "L" => DataType::UInt64,
183 "e" => DataType::Float16,
184 "f" => DataType::Float32,
185 "g" => DataType::Float64,
186 "z" => DataType::Binary,
187 "Z" => DataType::LargeBinary,
188 "u" => DataType::Utf8,
189 "U" => DataType::LargeUtf8,
190 "tdD" => DataType::Date32,
191 "tdm" => DataType::Date64,
192 "tts" => DataType::Time32(TimeUnit::Second),
193 "ttm" => DataType::Time32(TimeUnit::Millisecond),
194 "ttu" => DataType::Time64(TimeUnit::Microsecond),
195 "ttn" => DataType::Time64(TimeUnit::Nanosecond),
196 dt => {
197 return Err(ArrowError::CDataInterface(format!(
198 "The datatype \"{}\" is not supported in the Rust implementation",
199 dt
200 )))
201 }
202 })
203 }
204
205 /// the inverse of [to_datatype]
from_datatype(datatype: &DataType) -> Result<String>206 fn from_datatype(datatype: &DataType) -> Result<String> {
207 Ok(match datatype {
208 DataType::Null => "n",
209 DataType::Boolean => "b",
210 DataType::Int8 => "c",
211 DataType::UInt8 => "C",
212 DataType::Int16 => "s",
213 DataType::UInt16 => "S",
214 DataType::Int32 => "i",
215 DataType::UInt32 => "I",
216 DataType::Int64 => "l",
217 DataType::UInt64 => "L",
218 DataType::Float16 => "e",
219 DataType::Float32 => "f",
220 DataType::Float64 => "g",
221 DataType::Binary => "z",
222 DataType::LargeBinary => "Z",
223 DataType::Utf8 => "u",
224 DataType::LargeUtf8 => "U",
225 DataType::Date32 => "tdD",
226 DataType::Date64 => "tdm",
227 DataType::Time32(TimeUnit::Second) => "tts",
228 DataType::Time32(TimeUnit::Millisecond) => "ttm",
229 DataType::Time64(TimeUnit::Microsecond) => "ttu",
230 DataType::Time64(TimeUnit::Nanosecond) => "ttn",
231 z => {
232 return Err(ArrowError::CDataInterface(format!(
233 "The datatype \"{:?}\" is still not supported in Rust implementation",
234 z
235 )))
236 }
237 }
238 .to_string())
239 }
240
241 // returns the number of bits that buffer `i` (in the C data interface) is expected to have.
242 // This is set by the Arrow specification
bit_width(data_type: &DataType, i: usize) -> Result<usize>243 fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
244 Ok(match (data_type, i) {
245 // the null buffer is bit sized
246 (_, 0) => 1,
247 // primitive types first buffer's size is given by the native types
248 (DataType::Boolean, 1) => 1,
249 (DataType::UInt8, 1) => size_of::<u8>() * 8,
250 (DataType::UInt16, 1) => size_of::<u16>() * 8,
251 (DataType::UInt32, 1) => size_of::<u32>() * 8,
252 (DataType::UInt64, 1) => size_of::<u64>() * 8,
253 (DataType::Int8, 1) => size_of::<i8>() * 8,
254 (DataType::Int16, 1) => size_of::<i16>() * 8,
255 (DataType::Int32, 1) | (DataType::Date32, 1) | (DataType::Time32(_), 1) => size_of::<i32>() * 8,
256 (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_), 1) => size_of::<i64>() * 8,
257 (DataType::Float32, 1) => size_of::<f32>() * 8,
258 (DataType::Float64, 1) => size_of::<f64>() * 8,
259 // primitive types have a single buffer
260 (DataType::Boolean, _) |
261 (DataType::UInt8, _) |
262 (DataType::UInt16, _) |
263 (DataType::UInt32, _) |
264 (DataType::UInt64, _) |
265 (DataType::Int8, _) |
266 (DataType::Int16, _) |
267 (DataType::Int32, _) | (DataType::Date32, _) | (DataType::Time32(_), _) |
268 (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_), _) |
269 (DataType::Float32, _) |
270 (DataType::Float64, _) => {
271 return Err(ArrowError::CDataInterface(format!(
272 "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
273 data_type, i
274 )))
275 }
276 // Variable-sized binaries: have two buffers.
277 // "small": first buffer is i32, second is in bytes
278 (DataType::Utf8, 1) | (DataType::Binary, 1) => size_of::<i32>() * 8,
279 (DataType::Utf8, 2) | (DataType::Binary, 2) => size_of::<u8>() * 8,
280 (DataType::Utf8, _) | (DataType::Binary, _) => {
281 return Err(ArrowError::CDataInterface(format!(
282 "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
283 data_type, i
284 )))
285 }
286 // Variable-sized binaries: have two buffers.
287 // LargeUtf8: first buffer is i64, second is in bytes
288 (DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) => size_of::<i64>() * 8,
289 (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => size_of::<u8>() * 8,
290 (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) => {
291 return Err(ArrowError::CDataInterface(format!(
292 "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
293 data_type, i
294 )))
295 }
296 _ => {
297 return Err(ArrowError::CDataInterface(format!(
298 "The datatype \"{:?}\" is still not supported in Rust implementation",
299 data_type
300 )))
301 }
302 })
303 }
304
/// ABI-compatible mirror of the `ArrowArray` struct of the C Data Interface.
///
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>.
/// The definition was originally produced by bindgen. Because of `#[repr(C)]`
/// the fields are laid out in declaration order, so neither their order nor
/// their types may change.
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowArray {
    pub(crate) length: i64,
    pub(crate) null_count: i64,
    pub(crate) offset: i64,
    pub(crate) n_buffers: i64,
    pub(crate) n_children: i64,
    pub(crate) buffers: *mut *const std::os::raw::c_void,
    children: *mut *mut Self,
    dictionary: *mut Self,
    // Callback that frees what the array owns; `None` marks a released (or empty) array.
    release: Option<unsafe extern "C" fn(arg1: *mut Self)>,
    // On export, this MUST own everything the array points to: every buffer
    // referenced from `buffers` as well as the `buffers` pointer table itself.
    // Put differently, whatever is reachable from an [FFI_ArrowArray] belongs to
    // `private_data` and may rely on never outliving it.
    private_data: *mut std::os::raw::c_void,
}
327
328 // callback used to drop [FFI_ArrowArray] when it is exported
release_array(array: *mut FFI_ArrowArray)329 unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) {
330 if array.is_null() {
331 return;
332 }
333 let array = &mut *array;
334 // take ownership of `private_data`, therefore dropping it
335 Box::from_raw(array.private_data as *mut PrivateData);
336
337 array.release = None;
338 }
339
340 struct PrivateData {
341 buffers: Vec<Option<Buffer>>,
342 buffers_ptr: Box<[*const std::os::raw::c_void]>,
343 }
344
345 impl FFI_ArrowArray {
346 /// creates a new `FFI_ArrowArray` from existing data.
347 /// # Safety
348 /// This method releases `buffers`. Consumers of this struct *must* call `release` before
349 /// releasing this struct, or contents in `buffers` leak.
new( length: i64, null_count: i64, offset: i64, n_buffers: i64, buffers: Vec<Option<Buffer>>, ) -> Self350 unsafe fn new(
351 length: i64,
352 null_count: i64,
353 offset: i64,
354 n_buffers: i64,
355 buffers: Vec<Option<Buffer>>,
356 ) -> Self {
357 let buffers_ptr = buffers
358 .iter()
359 .map(|maybe_buffer| match maybe_buffer {
360 // note that `raw_data` takes into account the buffer's offset
361 Some(b) => b.as_ptr() as *const std::os::raw::c_void,
362 None => std::ptr::null(),
363 })
364 .collect::<Box<[_]>>();
365 let pointer = buffers_ptr.as_ptr() as *mut *const std::ffi::c_void;
366
367 // create the private data owning everything.
368 // any other data must be added here, e.g. via a struct, to track lifetime.
369 let private_data = Box::new(PrivateData {
370 buffers,
371 buffers_ptr,
372 });
373
374 Self {
375 length,
376 null_count,
377 offset,
378 n_buffers,
379 n_children: 0,
380 buffers: pointer,
381 children: std::ptr::null_mut(),
382 dictionary: std::ptr::null_mut(),
383 release: Some(release_array),
384 private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void,
385 }
386 }
387
388 // create an empty `FFI_ArrowArray`, which can be used to import data into
empty() -> Self389 fn empty() -> Self {
390 Self {
391 length: 0,
392 null_count: 0,
393 offset: 0,
394 n_buffers: 0,
395 n_children: 0,
396 buffers: std::ptr::null_mut(),
397 children: std::ptr::null_mut(),
398 dictionary: std::ptr::null_mut(),
399 release: None,
400 private_data: std::ptr::null_mut(),
401 }
402 }
403 }
404
405 /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer).
406 /// `bits` is the number of bits that the native type of this buffer has.
407 /// The size of the buffer will be `ceil(self.length * bits, 8)`.
408 /// # Panic
409 /// This function panics if `i` is larger or equal to `n_buffers`.
410 /// # Safety
411 /// This function assumes that `ceil(self.length * bits, 8)` is the size of the buffer
create_buffer( array: Arc<FFI_ArrowArray>, index: usize, len: usize, ) -> Option<Buffer>412 unsafe fn create_buffer(
413 array: Arc<FFI_ArrowArray>,
414 index: usize,
415 len: usize,
416 ) -> Option<Buffer> {
417 if array.buffers.is_null() {
418 return None;
419 }
420 let buffers = array.buffers as *mut *const u8;
421
422 assert!(index < array.n_buffers as usize);
423 let ptr = *buffers.add(index);
424
425 NonNull::new(ptr as *mut u8).map(|ptr| Buffer::from_unowned(ptr, len, array))
426 }
427
428 impl Drop for FFI_ArrowArray {
drop(&mut self)429 fn drop(&mut self) {
430 match self.release {
431 None => (),
432 Some(release) => unsafe { release(self) },
433 };
434 }
435 }
436
437 /// Struct used to move an Array from and to the C Data Interface.
438 /// Its main responsibility is to expose functionality that requires
439 /// both [FFI_ArrowArray] and [FFI_ArrowSchema].
440 ///
441 /// This struct has two main paths:
442 ///
443 /// ## Import from the C Data Interface
444 /// * [ArrowArray::empty] to allocate memory to be filled by an external call
445 /// * [ArrowArray::try_from_raw] to consume two non-null allocated pointers
446 /// ## Export to the C Data Interface
447 /// * [ArrowArray::try_new] to create a new [ArrowArray] from Rust-specific information
448 /// * [ArrowArray::into_raw] to expose two pointers for [FFI_ArrowArray] and [FFI_ArrowSchema].
449 ///
450 /// # Safety
451 /// Whoever creates this struct is responsible for releasing their resources. Specifically,
452 /// consumers *must* call [ArrowArray::into_raw] and take ownership of the individual pointers,
453 /// calling [FFI_ArrowArray::release] and [FFI_ArrowSchema::release] accordingly.
454 ///
455 /// Furthermore, this struct assumes that the incoming data agrees with the C data interface.
456 #[derive(Debug)]
457 pub struct ArrowArray {
458 // these are ref-counted because they can be shared by multiple buffers.
459 array: Arc<FFI_ArrowArray>,
460 schema: Arc<FFI_ArrowSchema>,
461 }
462
463 impl ArrowArray {
464 /// creates a new `ArrowArray`. This is used to export to the C Data Interface.
465 /// # Safety
466 /// See safety of [ArrowArray]
try_new( data_type: &DataType, len: usize, null_count: usize, null_buffer: Option<Buffer>, offset: usize, buffers: Vec<Buffer>, _child_data: Vec<ArrowArray>, ) -> Result<Self>467 pub unsafe fn try_new(
468 data_type: &DataType,
469 len: usize,
470 null_count: usize,
471 null_buffer: Option<Buffer>,
472 offset: usize,
473 buffers: Vec<Buffer>,
474 _child_data: Vec<ArrowArray>,
475 ) -> Result<Self> {
476 let format = from_datatype(data_type)?;
477 // * insert the null buffer at the start
478 // * make all others `Option<Buffer>`.
479 let new_buffers = iter::once(null_buffer)
480 .chain(buffers.iter().map(|b| Some(b.clone())))
481 .collect::<Vec<_>>();
482
483 let schema = Arc::new(FFI_ArrowSchema::new(&format));
484 let array = Arc::new(FFI_ArrowArray::new(
485 len as i64,
486 null_count as i64,
487 offset as i64,
488 new_buffers.len() as i64,
489 new_buffers,
490 ));
491
492 Ok(ArrowArray { schema, array })
493 }
494
495 /// creates a new [ArrowArray] from two pointers. Used to import from the C Data Interface.
496 /// # Safety
497 /// See safety of [ArrowArray]
498 /// # Error
499 /// Errors if any of the pointers is null
try_from_raw( array: *const FFI_ArrowArray, schema: *const FFI_ArrowSchema, ) -> Result<Self>500 pub unsafe fn try_from_raw(
501 array: *const FFI_ArrowArray,
502 schema: *const FFI_ArrowSchema,
503 ) -> Result<Self> {
504 if array.is_null() || schema.is_null() {
505 return Err(ArrowError::MemoryError(
506 "At least one of the pointers passed to `try_from_raw` is null"
507 .to_string(),
508 ));
509 };
510 Ok(Self {
511 array: Arc::from_raw(array as *mut FFI_ArrowArray),
512 schema: Arc::from_raw(schema as *mut FFI_ArrowSchema),
513 })
514 }
515
516 /// creates a new empty [ArrowArray]. Used to import from the C Data Interface.
517 /// # Safety
518 /// See safety of [ArrowArray]
empty() -> Self519 pub unsafe fn empty() -> Self {
520 let schema = Arc::new(FFI_ArrowSchema::empty());
521 let array = Arc::new(FFI_ArrowArray::empty());
522 ArrowArray { schema, array }
523 }
524
525 /// exports [ArrowArray] to the C Data Interface
into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema)526 pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) {
527 (Arc::into_raw(this.array), Arc::into_raw(this.schema))
528 }
529
530 /// returns the null bit buffer.
531 /// Rust implementation uses a buffer that is not part of the array of buffers.
532 /// The C Data interface's null buffer is part of the array of buffers.
null_bit_buffer(&self) -> Option<Buffer>533 pub fn null_bit_buffer(&self) -> Option<Buffer> {
534 // similar to `self.buffer_len(0)`, but without `Result`.
535 let buffer_len = bit_util::ceil(self.array.length as usize, 8);
536
537 unsafe { create_buffer(self.array.clone(), 0, buffer_len) }
538 }
539
540 /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface)
541 // Rust implementation uses fixed-sized buffers, which require knowledge of their `len`.
542 // for variable-sized buffers, such as the second buffer of a stringArray, we need
543 // to fetch offset buffer's len to build the second buffer.
buffer_len(&self, i: usize) -> Result<usize>544 fn buffer_len(&self, i: usize) -> Result<usize> {
545 let data_type = &self.data_type()?;
546
547 Ok(match (data_type, i) {
548 (DataType::Utf8, 1)
549 | (DataType::LargeUtf8, 1)
550 | (DataType::Binary, 1)
551 | (DataType::LargeBinary, 1) => {
552 // the len of the offset buffer (buffer 1) equals length + 1
553 let bits = bit_width(data_type, i)?;
554 debug_assert_eq!(bits % 8, 0);
555 (self.array.length as usize + 1) * (bits / 8)
556 }
557 (DataType::Utf8, 2) | (DataType::Binary, 2) => {
558 // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
559 let len = self.buffer_len(1)?;
560 // first buffer is the null buffer => add(1)
561 // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
562 #[allow(clippy::cast_ptr_alignment)]
563 let offset_buffer = unsafe {
564 *(self.array.buffers as *mut *const u8).add(1) as *const i32
565 };
566 // get last offset
567 (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
568 }
569 (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
570 // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
571 let len = self.buffer_len(1)?;
572 // first buffer is the null buffer => add(1)
573 // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
574 #[allow(clippy::cast_ptr_alignment)]
575 let offset_buffer = unsafe {
576 *(self.array.buffers as *mut *const u8).add(1) as *const i64
577 };
578 // get last offset
579 (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
580 }
581 // buffer len of primitive types
582 _ => {
583 let bits = bit_width(data_type, i)?;
584 bit_util::ceil(self.array.length as usize * bits, 8)
585 }
586 })
587 }
588
589 /// returns all buffers, as organized by Rust (i.e. null buffer is skipped)
buffers(&self) -> Result<Vec<Buffer>>590 pub fn buffers(&self) -> Result<Vec<Buffer>> {
591 (0..self.array.n_buffers - 1)
592 .map(|index| {
593 // + 1: skip null buffer
594 let index = (index + 1) as usize;
595
596 let len = self.buffer_len(index)?;
597
598 unsafe { create_buffer(self.array.clone(), index, len) }.ok_or_else(
599 || {
600 ArrowError::CDataInterface(format!(
601 "The external buffer at position {} is null.",
602 index - 1
603 ))
604 },
605 )
606 })
607 .collect()
608 }
609
610 /// the length of the array
len(&self) -> usize611 pub fn len(&self) -> usize {
612 self.array.length as usize
613 }
614
615 /// whether the array is empty
is_empty(&self) -> bool616 pub fn is_empty(&self) -> bool {
617 self.array.length == 0
618 }
619
620 /// the offset of the array
offset(&self) -> usize621 pub fn offset(&self) -> usize {
622 self.array.offset as usize
623 }
624
625 /// the null count of the array
null_count(&self) -> usize626 pub fn null_count(&self) -> usize {
627 self.array.null_count as usize
628 }
629
630 /// the data_type as declared in the schema
data_type(&self) -> Result<DataType>631 pub fn data_type(&self) -> Result<DataType> {
632 to_datatype(self.schema.format())
633 }
634 }
635
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::{
        make_array, Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray,
        GenericBinaryArray, GenericStringArray, Int32Array, StringOffsetSizeTrait,
        Time32MillisecondArray,
    };
    use crate::compute::kernels;
    use std::convert::TryFrom;
    use std::sync::Arc;

    /// Exports `array` into an [ArrowArray] and, simulating a consumer,
    /// imports the result back as a native array.
    fn round_trip(array: &dyn Array) -> Result<Arc<dyn Array>> {
        // export it
        let exported = ArrowArray::try_from(array.data().as_ref().clone())?;

        // (simulate consumer) import it
        let imported = ArrayData::try_from(exported)?;
        Ok(make_array(Arc::new(imported)))
    }

    #[test]
    fn test_round_trip() -> Result<()> {
        // create an array natively and send it through the interface
        let native = Int32Array::from(vec![1, 2, 3]);
        let imported = round_trip(&native)?;

        // perform some operation
        let imported = imported.as_any().downcast_ref::<Int32Array>().unwrap();
        let doubled = kernels::arithmetic::add(imported, imported).unwrap();

        // verify
        assert_eq!(doubled, Int32Array::from(vec![2, 4, 6]));

        // (drop/release)
        Ok(())
    }
    // case with nulls is tested in the docs, through the example on this module.

    fn test_generic_string<Offset: StringOffsetSizeTrait>() -> Result<()> {
        // create an array natively and send it through the interface
        let native =
            GenericStringArray::<Offset>::from(vec![Some("a"), None, Some("aaa")]);
        let imported = round_trip(&native)?;

        // perform some operation
        let doubled =
            kernels::concat::concat(&[imported.as_ref(), imported.as_ref()]).unwrap();
        let doubled = doubled
            .as_any()
            .downcast_ref::<GenericStringArray<Offset>>()
            .unwrap();

        // verify
        let expected = GenericStringArray::<Offset>::from(vec![
            Some("a"),
            None,
            Some("aaa"),
            Some("a"),
            None,
            Some("aaa"),
        ]);
        assert_eq!(doubled, &expected);

        // (drop/release)
        Ok(())
    }

    #[test]
    fn test_string() -> Result<()> {
        test_generic_string::<i32>()
    }

    #[test]
    fn test_large_string() -> Result<()> {
        test_generic_string::<i64>()
    }

    fn test_generic_binary<Offset: BinaryOffsetSizeTrait>() -> Result<()> {
        // create an array natively and send it through the interface
        let values: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
        let native = GenericBinaryArray::<Offset>::from(values);
        let imported = round_trip(&native)?;

        // perform some operation
        let doubled =
            kernels::concat::concat(&[imported.as_ref(), imported.as_ref()]).unwrap();
        let doubled = doubled
            .as_any()
            .downcast_ref::<GenericBinaryArray<Offset>>()
            .unwrap();

        // verify
        let expected: Vec<Option<&[u8]>> = vec![
            Some(b"a"),
            None,
            Some(b"aaa"),
            Some(b"a"),
            None,
            Some(b"aaa"),
        ];
        let expected = GenericBinaryArray::<Offset>::from(expected);
        assert_eq!(doubled, &expected);

        // (drop/release)
        Ok(())
    }

    #[test]
    fn test_binary() -> Result<()> {
        test_generic_binary::<i32>()
    }

    #[test]
    fn test_large_binary() -> Result<()> {
        test_generic_binary::<i64>()
    }

    #[test]
    fn test_bool() -> Result<()> {
        // create an array natively and send it through the interface
        let native = BooleanArray::from(vec![None, Some(true), Some(false)]);
        let imported = round_trip(&native)?;

        // perform some operation
        let imported = imported.as_any().downcast_ref::<BooleanArray>().unwrap();
        let negated = kernels::boolean::not(imported)?;

        // verify
        assert_eq!(
            negated,
            BooleanArray::from(vec![None, Some(false), Some(true)])
        );

        // (drop/release)
        Ok(())
    }

    #[test]
    fn test_time32() -> Result<()> {
        // create an array natively and send it through the interface
        let native = Time32MillisecondArray::from(vec![None, Some(1), Some(2)]);
        let imported = round_trip(&native)?;

        // perform some operation
        let doubled =
            kernels::concat::concat(&[imported.as_ref(), imported.as_ref()]).unwrap();
        let doubled = doubled
            .as_any()
            .downcast_ref::<Time32MillisecondArray>()
            .unwrap();

        // verify
        let expected = Time32MillisecondArray::from(vec![
            None,
            Some(1),
            Some(2),
            None,
            Some(1),
            Some(2),
        ]);
        assert_eq!(doubled, &expected);

        // (drop/release)
        Ok(())
    }
}
823