1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 //! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
19 //!
20 //! Generally, this module is divided in two main interfaces:
21 //! One interface maps C ABI to native Rust types, i.e. convert c-pointers, c_char, to native rust.
22 //! This is handled by [FFI_ArrowSchema] and [FFI_ArrowArray].
23 //!
//! The second interface maps native Rust types to the Rust-specific implementation of Arrow such as `format` to `DataType`,
25 //! `Buffer`, etc. This is handled by `ArrowArray`.
26 //!
27 //! ```rust
28 //! # use std::sync::Arc;
29 //! # use arrow::array::{Int32Array, Array, ArrayData, make_array_from_raw};
30 //! # use arrow::error::{Result, ArrowError};
31 //! # use arrow::compute::kernels::arithmetic;
32 //! # use std::convert::TryFrom;
33 //! # fn main() -> Result<()> {
34 //! // create an array natively
35 //! let array = Int32Array::from(vec![Some(1), None, Some(3)]);
36 //!
37 //! // export it
38 //! let (array_ptr, schema_ptr) = array.to_raw()?;
39 //!
40 //! // consumed and used by something else...
41 //!
42 //! // import it
43 //! let array = unsafe { make_array_from_raw(array_ptr, schema_ptr)? };
44 //!
45 //! // perform some operation
46 //! let array = array.as_any().downcast_ref::<Int32Array>().ok_or(
47 //!     ArrowError::ParseError("Expects an int32".to_string()),
48 //! )?;
49 //! let array = arithmetic::add(&array, &array)?;
50 //!
51 //! // verify
52 //! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)]));
53 //!
54 //! // (drop/release)
55 //! Ok(())
56 //! }
57 //! ```
58 
59 /*
60 # Design:
61 
62 Main assumptions:
* A memory region is deallocated according to its own release mechanism.
64 * Rust shares memory regions between arrays.
65 * A memory region should be deallocated when no-one is using it.
66 
67 The design of this module is as follows:
68 
69 `ArrowArray` contains two `Arc`s, one per ABI-compatible `struct`, each containing data
70 according to the C Data Interface. These Arcs are used for ref counting of the structs
71 within Rust and lifetime management.
72 
Each ABI-compatible `struct` knows how to `drop` itself, calling `release`.
74 
75 To import an array, unsafely create an `ArrowArray` from two pointers using [ArrowArray::try_from_raw].
76 To export an array, create an `ArrowArray` using [ArrowArray::try_new].
77 */
78 
79 use std::{
80     ffi::CStr,
81     ffi::CString,
82     iter,
83     mem::size_of,
84     ptr::{self, NonNull},
85     sync::Arc,
86 };
87 
88 use crate::buffer::Buffer;
89 use crate::datatypes::{DataType, TimeUnit};
90 use crate::error::{ArrowError, Result};
91 use crate::util::bit_util;
92 
/// ABI-compatible struct for `ArrowSchema` from C Data Interface
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
/// This was created by bindgen
///
/// The struct is `#[repr(C)]`: the order and the types of the fields are part of the ABI
/// and must not be changed.
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowSchema {
    // null-terminated string describing the data type; null for an empty schema
    format: *const ::std::os::raw::c_char,
    name: *const ::std::os::raw::c_char,
    metadata: *const ::std::os::raw::c_char,
    flags: i64,
    n_children: i64,
    children: *mut *mut FFI_ArrowSchema,
    dictionary: *mut FFI_ArrowSchema,
    // callback invoked on drop; `None` marks the struct as empty or already released
    release: ::std::option::Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowSchema)>,
    private_data: *mut ::std::os::raw::c_void,
}

/// Callback used to drop an [FFI_ArrowSchema] when it is exported.
///
/// A null `schema` is a no-op, mirroring `release_array`.
///
/// # Safety
/// `schema` must be null or point to a valid [FFI_ArrowSchema] whose `format` is either null
/// or a pointer produced by `CString::into_raw` (as done by [FFI_ArrowSchema::new]).
unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) {
    if schema.is_null() {
        return;
    }
    let schema = &mut *schema;

    if !schema.format.is_null() {
        // take ownership back to release it.
        drop(CString::from_raw(schema.format as *mut std::os::raw::c_char));
        // the string is gone: do not leave a dangling pointer behind
        schema.format = ptr::null();
    }

    schema.release = None;
}

impl FFI_ArrowSchema {
    /// create a new [FFI_ArrowSchema] from a format.
    ///
    /// The format is copied into a heap-allocated C string that is freed by `release`.
    ///
    /// # Panics
    /// Panics if `format` contains an interior NUL byte.
    fn new(format: &str) -> FFI_ArrowSchema {
        // <https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema>
        FFI_ArrowSchema {
            format: CString::new(format)
                .expect("a format string must not contain NUL bytes")
                .into_raw(),
            name: std::ptr::null_mut(),
            metadata: std::ptr::null_mut(),
            flags: 0,
            n_children: 0,
            children: ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: Some(release_schema),
            private_data: std::ptr::null_mut(),
        }
    }

    /// create an empty [FFI_ArrowSchema]
    ///
    /// Every pointer is null and there is no `release` callback, so dropping it does nothing.
    fn empty() -> Self {
        Self {
            format: std::ptr::null_mut(),
            name: std::ptr::null_mut(),
            metadata: std::ptr::null_mut(),
            flags: 0,
            n_children: 0,
            children: ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: None,
            private_data: std::ptr::null_mut(),
        }
    }

    /// returns the format of this schema.
    ///
    /// # Panics
    /// Panics if the schema has no format (it is empty or was released), or if the format
    /// is not valid UTF-8.
    pub fn format(&self) -> &str {
        // `CStr::from_ptr` on a null pointer is undefined behavior: fail loudly instead.
        assert!(
            !self.format.is_null(),
            "The schema has no format: it is empty or was already released"
        );
        unsafe { CStr::from_ptr(self.format) }
            .to_str()
            .expect("The external API has a non-utf8 as format")
    }
}

impl Drop for FFI_ArrowSchema {
    /// calls the `release` callback, if there is one.
    fn drop(&mut self) {
        if let Some(release) = self.release {
            unsafe { release(self) };
        }
    }
}
168 
169 /// maps a DataType `format` to a [DataType](arrow::datatypes::DataType).
170 /// See https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
to_datatype(format: &str) -> Result<DataType>171 fn to_datatype(format: &str) -> Result<DataType> {
172     Ok(match format {
173         "n" => DataType::Null,
174         "b" => DataType::Boolean,
175         "c" => DataType::Int8,
176         "C" => DataType::UInt8,
177         "s" => DataType::Int16,
178         "S" => DataType::UInt16,
179         "i" => DataType::Int32,
180         "I" => DataType::UInt32,
181         "l" => DataType::Int64,
182         "L" => DataType::UInt64,
183         "e" => DataType::Float16,
184         "f" => DataType::Float32,
185         "g" => DataType::Float64,
186         "z" => DataType::Binary,
187         "Z" => DataType::LargeBinary,
188         "u" => DataType::Utf8,
189         "U" => DataType::LargeUtf8,
190         "tdD" => DataType::Date32,
191         "tdm" => DataType::Date64,
192         "tts" => DataType::Time32(TimeUnit::Second),
193         "ttm" => DataType::Time32(TimeUnit::Millisecond),
194         "ttu" => DataType::Time64(TimeUnit::Microsecond),
195         "ttn" => DataType::Time64(TimeUnit::Nanosecond),
196         dt => {
197             return Err(ArrowError::CDataInterface(format!(
198                 "The datatype \"{}\" is not supported in the Rust implementation",
199                 dt
200             )))
201         }
202     })
203 }
204 
205 /// the inverse of [to_datatype]
from_datatype(datatype: &DataType) -> Result<String>206 fn from_datatype(datatype: &DataType) -> Result<String> {
207     Ok(match datatype {
208         DataType::Null => "n",
209         DataType::Boolean => "b",
210         DataType::Int8 => "c",
211         DataType::UInt8 => "C",
212         DataType::Int16 => "s",
213         DataType::UInt16 => "S",
214         DataType::Int32 => "i",
215         DataType::UInt32 => "I",
216         DataType::Int64 => "l",
217         DataType::UInt64 => "L",
218         DataType::Float16 => "e",
219         DataType::Float32 => "f",
220         DataType::Float64 => "g",
221         DataType::Binary => "z",
222         DataType::LargeBinary => "Z",
223         DataType::Utf8 => "u",
224         DataType::LargeUtf8 => "U",
225         DataType::Date32 => "tdD",
226         DataType::Date64 => "tdm",
227         DataType::Time32(TimeUnit::Second) => "tts",
228         DataType::Time32(TimeUnit::Millisecond) => "ttm",
229         DataType::Time64(TimeUnit::Microsecond) => "ttu",
230         DataType::Time64(TimeUnit::Nanosecond) => "ttn",
231         z => {
232             return Err(ArrowError::CDataInterface(format!(
233                 "The datatype \"{:?}\" is still not supported in Rust implementation",
234                 z
235             )))
236         }
237     }
238     .to_string())
239 }
240 
241 // returns the number of bits that buffer `i` (in the C data interface) is expected to have.
242 // This is set by the Arrow specification
bit_width(data_type: &DataType, i: usize) -> Result<usize>243 fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
244     Ok(match (data_type, i) {
245         // the null buffer is bit sized
246         (_, 0) => 1,
247         // primitive types first buffer's size is given by the native types
248         (DataType::Boolean, 1) => 1,
249         (DataType::UInt8, 1) => size_of::<u8>() * 8,
250         (DataType::UInt16, 1) => size_of::<u16>() * 8,
251         (DataType::UInt32, 1) => size_of::<u32>() * 8,
252         (DataType::UInt64, 1) => size_of::<u64>() * 8,
253         (DataType::Int8, 1) => size_of::<i8>() * 8,
254         (DataType::Int16, 1) => size_of::<i16>() * 8,
255         (DataType::Int32, 1) | (DataType::Date32, 1) | (DataType::Time32(_), 1) => size_of::<i32>() * 8,
256         (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_), 1) => size_of::<i64>() * 8,
257         (DataType::Float32, 1) => size_of::<f32>() * 8,
258         (DataType::Float64, 1) => size_of::<f64>() * 8,
259         // primitive types have a single buffer
260         (DataType::Boolean, _) |
261         (DataType::UInt8, _) |
262         (DataType::UInt16, _) |
263         (DataType::UInt32, _) |
264         (DataType::UInt64, _) |
265         (DataType::Int8, _) |
266         (DataType::Int16, _) |
267         (DataType::Int32, _) | (DataType::Date32, _) | (DataType::Time32(_), _) |
268         (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_), _) |
269         (DataType::Float32, _) |
270         (DataType::Float64, _) => {
271             return Err(ArrowError::CDataInterface(format!(
272                 "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
273                 data_type, i
274             )))
275         }
276         // Variable-sized binaries: have two buffers.
277         // "small": first buffer is i32, second is in bytes
278         (DataType::Utf8, 1) | (DataType::Binary, 1) => size_of::<i32>() * 8,
279         (DataType::Utf8, 2) | (DataType::Binary, 2) => size_of::<u8>() * 8,
280         (DataType::Utf8, _) | (DataType::Binary, _) => {
281             return Err(ArrowError::CDataInterface(format!(
282                 "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
283                 data_type, i
284             )))
285         }
286         // Variable-sized binaries: have two buffers.
287         // LargeUtf8: first buffer is i64, second is in bytes
288         (DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) => size_of::<i64>() * 8,
289         (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => size_of::<u8>() * 8,
290         (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) => {
291             return Err(ArrowError::CDataInterface(format!(
292                 "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.",
293                 data_type, i
294             )))
295         }
296         _ => {
297             return Err(ArrowError::CDataInterface(format!(
298                 "The datatype \"{:?}\" is still not supported in Rust implementation",
299                 data_type
300             )))
301         }
302     })
303 }
304 
/// ABI-compatible struct for ArrowArray from C Data Interface
/// See <https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions>
/// This was created by bindgen
///
/// The struct is `#[repr(C)]`: the order and the types of the fields are part of the ABI
/// and must not be changed.
#[repr(C)]
#[derive(Debug)]
pub struct FFI_ArrowArray {
    pub(crate) length: i64,
    pub(crate) null_count: i64,
    pub(crate) offset: i64,
    pub(crate) n_buffers: i64,
    pub(crate) n_children: i64,
    // array of `n_buffers` buffer pointers; position 0 is the null buffer
    pub(crate) buffers: *mut *const std::ffi::c_void,
    children: *mut *mut FFI_ArrowArray,
    dictionary: *mut FFI_ArrowArray,
    // callback invoked on drop; `None` marks the struct as empty or already released
    release: Option<unsafe extern "C" fn(arg1: *mut FFI_ArrowArray)>,
    // When exported, this MUST contain everything that is owned by this array.
    // For example, any buffer pointed to in `buffers` must be here, as well as the `buffers`
    // pointer itself.
    // In other words, everything in [FFI_ArrowArray] must be owned by `private_data` and can
    // assume that it does not outlive `private_data`.
    private_data: *mut std::ffi::c_void,
}
327 
328 // callback used to drop [FFI_ArrowArray] when it is exported
release_array(array: *mut FFI_ArrowArray)329 unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) {
330     if array.is_null() {
331         return;
332     }
333     let array = &mut *array;
334     // take ownership of `private_data`, therefore dropping it
335     Box::from_raw(array.private_data as *mut PrivateData);
336 
337     array.release = None;
338 }
339 
340 struct PrivateData {
341     buffers: Vec<Option<Buffer>>,
342     buffers_ptr: Box<[*const std::os::raw::c_void]>,
343 }
344 
345 impl FFI_ArrowArray {
346     /// creates a new `FFI_ArrowArray` from existing data.
347     /// # Safety
348     /// This method releases `buffers`. Consumers of this struct *must* call `release` before
349     /// releasing this struct, or contents in `buffers` leak.
new( length: i64, null_count: i64, offset: i64, n_buffers: i64, buffers: Vec<Option<Buffer>>, ) -> Self350     unsafe fn new(
351         length: i64,
352         null_count: i64,
353         offset: i64,
354         n_buffers: i64,
355         buffers: Vec<Option<Buffer>>,
356     ) -> Self {
357         let buffers_ptr = buffers
358             .iter()
359             .map(|maybe_buffer| match maybe_buffer {
360                 // note that `raw_data` takes into account the buffer's offset
361                 Some(b) => b.as_ptr() as *const std::os::raw::c_void,
362                 None => std::ptr::null(),
363             })
364             .collect::<Box<[_]>>();
365         let pointer = buffers_ptr.as_ptr() as *mut *const std::ffi::c_void;
366 
367         // create the private data owning everything.
368         // any other data must be added here, e.g. via a struct, to track lifetime.
369         let private_data = Box::new(PrivateData {
370             buffers,
371             buffers_ptr,
372         });
373 
374         Self {
375             length,
376             null_count,
377             offset,
378             n_buffers,
379             n_children: 0,
380             buffers: pointer,
381             children: std::ptr::null_mut(),
382             dictionary: std::ptr::null_mut(),
383             release: Some(release_array),
384             private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void,
385         }
386     }
387 
388     // create an empty `FFI_ArrowArray`, which can be used to import data into
empty() -> Self389     fn empty() -> Self {
390         Self {
391             length: 0,
392             null_count: 0,
393             offset: 0,
394             n_buffers: 0,
395             n_children: 0,
396             buffers: std::ptr::null_mut(),
397             children: std::ptr::null_mut(),
398             dictionary: std::ptr::null_mut(),
399             release: None,
400             private_data: std::ptr::null_mut(),
401         }
402     }
403 }
404 
405 /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer).
406 /// `bits` is the number of bits that the native type of this buffer has.
407 /// The size of the buffer will be `ceil(self.length * bits, 8)`.
408 /// # Panic
409 /// This function panics if `i` is larger or equal to `n_buffers`.
410 /// # Safety
411 /// This function assumes that `ceil(self.length * bits, 8)` is the size of the buffer
create_buffer( array: Arc<FFI_ArrowArray>, index: usize, len: usize, ) -> Option<Buffer>412 unsafe fn create_buffer(
413     array: Arc<FFI_ArrowArray>,
414     index: usize,
415     len: usize,
416 ) -> Option<Buffer> {
417     if array.buffers.is_null() {
418         return None;
419     }
420     let buffers = array.buffers as *mut *const u8;
421 
422     assert!(index < array.n_buffers as usize);
423     let ptr = *buffers.add(index);
424 
425     NonNull::new(ptr as *mut u8).map(|ptr| Buffer::from_unowned(ptr, len, array))
426 }
427 
428 impl Drop for FFI_ArrowArray {
drop(&mut self)429     fn drop(&mut self) {
430         match self.release {
431             None => (),
432             Some(release) => unsafe { release(self) },
433         };
434     }
435 }
436 
437 /// Struct used to move an Array from and to the C Data Interface.
438 /// Its main responsibility is to expose functionality that requires
439 /// both [FFI_ArrowArray] and [FFI_ArrowSchema].
440 ///
441 /// This struct has two main paths:
442 ///
443 /// ## Import from the C Data Interface
444 /// * [ArrowArray::empty] to allocate memory to be filled by an external call
445 /// * [ArrowArray::try_from_raw] to consume two non-null allocated pointers
446 /// ## Export to the C Data Interface
447 /// * [ArrowArray::try_new] to create a new [ArrowArray] from Rust-specific information
448 /// * [ArrowArray::into_raw] to expose two pointers for [FFI_ArrowArray] and [FFI_ArrowSchema].
449 ///
450 /// # Safety
451 /// Whoever creates this struct is responsible for releasing their resources. Specifically,
452 /// consumers *must* call [ArrowArray::into_raw] and take ownership of the individual pointers,
453 /// calling [FFI_ArrowArray::release] and [FFI_ArrowSchema::release] accordingly.
454 ///
455 /// Furthermore, this struct assumes that the incoming data agrees with the C data interface.
456 #[derive(Debug)]
457 pub struct ArrowArray {
458     // these are ref-counted because they can be shared by multiple buffers.
459     array: Arc<FFI_ArrowArray>,
460     schema: Arc<FFI_ArrowSchema>,
461 }
462 
463 impl ArrowArray {
464     /// creates a new `ArrowArray`. This is used to export to the C Data Interface.
465     /// # Safety
466     /// See safety of [ArrowArray]
try_new( data_type: &DataType, len: usize, null_count: usize, null_buffer: Option<Buffer>, offset: usize, buffers: Vec<Buffer>, _child_data: Vec<ArrowArray>, ) -> Result<Self>467     pub unsafe fn try_new(
468         data_type: &DataType,
469         len: usize,
470         null_count: usize,
471         null_buffer: Option<Buffer>,
472         offset: usize,
473         buffers: Vec<Buffer>,
474         _child_data: Vec<ArrowArray>,
475     ) -> Result<Self> {
476         let format = from_datatype(data_type)?;
477         // * insert the null buffer at the start
478         // * make all others `Option<Buffer>`.
479         let new_buffers = iter::once(null_buffer)
480             .chain(buffers.iter().map(|b| Some(b.clone())))
481             .collect::<Vec<_>>();
482 
483         let schema = Arc::new(FFI_ArrowSchema::new(&format));
484         let array = Arc::new(FFI_ArrowArray::new(
485             len as i64,
486             null_count as i64,
487             offset as i64,
488             new_buffers.len() as i64,
489             new_buffers,
490         ));
491 
492         Ok(ArrowArray { schema, array })
493     }
494 
495     /// creates a new [ArrowArray] from two pointers. Used to import from the C Data Interface.
496     /// # Safety
497     /// See safety of [ArrowArray]
498     /// # Error
499     /// Errors if any of the pointers is null
try_from_raw( array: *const FFI_ArrowArray, schema: *const FFI_ArrowSchema, ) -> Result<Self>500     pub unsafe fn try_from_raw(
501         array: *const FFI_ArrowArray,
502         schema: *const FFI_ArrowSchema,
503     ) -> Result<Self> {
504         if array.is_null() || schema.is_null() {
505             return Err(ArrowError::MemoryError(
506                 "At least one of the pointers passed to `try_from_raw` is null"
507                     .to_string(),
508             ));
509         };
510         Ok(Self {
511             array: Arc::from_raw(array as *mut FFI_ArrowArray),
512             schema: Arc::from_raw(schema as *mut FFI_ArrowSchema),
513         })
514     }
515 
516     /// creates a new empty [ArrowArray]. Used to import from the C Data Interface.
517     /// # Safety
518     /// See safety of [ArrowArray]
empty() -> Self519     pub unsafe fn empty() -> Self {
520         let schema = Arc::new(FFI_ArrowSchema::empty());
521         let array = Arc::new(FFI_ArrowArray::empty());
522         ArrowArray { schema, array }
523     }
524 
525     /// exports [ArrowArray] to the C Data Interface
into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema)526     pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) {
527         (Arc::into_raw(this.array), Arc::into_raw(this.schema))
528     }
529 
530     /// returns the null bit buffer.
531     /// Rust implementation uses a buffer that is not part of the array of buffers.
532     /// The C Data interface's null buffer is part of the array of buffers.
null_bit_buffer(&self) -> Option<Buffer>533     pub fn null_bit_buffer(&self) -> Option<Buffer> {
534         // similar to `self.buffer_len(0)`, but without `Result`.
535         let buffer_len = bit_util::ceil(self.array.length as usize, 8);
536 
537         unsafe { create_buffer(self.array.clone(), 0, buffer_len) }
538     }
539 
540     /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface)
541     // Rust implementation uses fixed-sized buffers, which require knowledge of their `len`.
542     // for variable-sized buffers, such as the second buffer of a stringArray, we need
543     // to fetch offset buffer's len to build the second buffer.
buffer_len(&self, i: usize) -> Result<usize>544     fn buffer_len(&self, i: usize) -> Result<usize> {
545         let data_type = &self.data_type()?;
546 
547         Ok(match (data_type, i) {
548             (DataType::Utf8, 1)
549             | (DataType::LargeUtf8, 1)
550             | (DataType::Binary, 1)
551             | (DataType::LargeBinary, 1) => {
552                 // the len of the offset buffer (buffer 1) equals length + 1
553                 let bits = bit_width(data_type, i)?;
554                 debug_assert_eq!(bits % 8, 0);
555                 (self.array.length as usize + 1) * (bits / 8)
556             }
557             (DataType::Utf8, 2) | (DataType::Binary, 2) => {
558                 // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
559                 let len = self.buffer_len(1)?;
560                 // first buffer is the null buffer => add(1)
561                 // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
562                 #[allow(clippy::cast_ptr_alignment)]
563                 let offset_buffer = unsafe {
564                     *(self.array.buffers as *mut *const u8).add(1) as *const i32
565                 };
566                 // get last offset
567                 (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
568             }
569             (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
570                 // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
571                 let len = self.buffer_len(1)?;
572                 // first buffer is the null buffer => add(1)
573                 // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
574                 #[allow(clippy::cast_ptr_alignment)]
575                 let offset_buffer = unsafe {
576                     *(self.array.buffers as *mut *const u8).add(1) as *const i64
577                 };
578                 // get last offset
579                 (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
580             }
581             // buffer len of primitive types
582             _ => {
583                 let bits = bit_width(data_type, i)?;
584                 bit_util::ceil(self.array.length as usize * bits, 8)
585             }
586         })
587     }
588 
589     /// returns all buffers, as organized by Rust (i.e. null buffer is skipped)
buffers(&self) -> Result<Vec<Buffer>>590     pub fn buffers(&self) -> Result<Vec<Buffer>> {
591         (0..self.array.n_buffers - 1)
592             .map(|index| {
593                 // + 1: skip null buffer
594                 let index = (index + 1) as usize;
595 
596                 let len = self.buffer_len(index)?;
597 
598                 unsafe { create_buffer(self.array.clone(), index, len) }.ok_or_else(
599                     || {
600                         ArrowError::CDataInterface(format!(
601                             "The external buffer at position {} is null.",
602                             index - 1
603                         ))
604                     },
605                 )
606             })
607             .collect()
608     }
609 
610     /// the length of the array
len(&self) -> usize611     pub fn len(&self) -> usize {
612         self.array.length as usize
613     }
614 
615     /// whether the array is empty
is_empty(&self) -> bool616     pub fn is_empty(&self) -> bool {
617         self.array.length == 0
618     }
619 
620     /// the offset of the array
offset(&self) -> usize621     pub fn offset(&self) -> usize {
622         self.array.offset as usize
623     }
624 
625     /// the null count of the array
null_count(&self) -> usize626     pub fn null_count(&self) -> usize {
627         self.array.null_count as usize
628     }
629 
630     /// the data_type as declared in the schema
data_type(&self) -> Result<DataType>631     pub fn data_type(&self) -> Result<DataType> {
632         to_datatype(self.schema.format())
633     }
634 }
635 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::{
        make_array, Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray,
        GenericBinaryArray, GenericStringArray, Int32Array, StringOffsetSizeTrait,
        Time32MillisecondArray,
    };
    use crate::compute::kernels;
    use std::convert::TryFrom;
    use std::sync::Arc;

    /// Exports `data` through the C Data Interface and imports it back, as a consumer would.
    fn round_trip(data: ArrayData) -> Result<Arc<ArrayData>> {
        // export it
        let exported = ArrowArray::try_from(data)?;

        // (simulate consumer) import it
        let imported = ArrayData::try_from(exported)?;
        Ok(Arc::new(imported))
    }

    #[test]
    fn test_round_trip() -> Result<()> {
        // create an array natively
        let native = Int32Array::from(vec![1, 2, 3]);

        // export it and import it back
        let imported = make_array(round_trip(native.data().as_ref().clone())?);

        // perform some operation
        let imported = imported.as_any().downcast_ref::<Int32Array>().unwrap();
        let doubled = kernels::arithmetic::add(&imported, &imported).unwrap();

        // verify
        assert_eq!(doubled, Int32Array::from(vec![2, 4, 6]));

        // (drop/release)
        Ok(())
    }
    // case with nulls is tested in the docs, through the example on this module.

    fn test_generic_string<Offset: StringOffsetSizeTrait>() -> Result<()> {
        // create an array natively
        let native =
            GenericStringArray::<Offset>::from(vec![Some("a"), None, Some("aaa")]);

        // export it and import it back
        let imported = make_array(round_trip(native.data().as_ref().clone())?);

        // perform some operation
        let doubled =
            kernels::concat::concat(&[imported.as_ref(), imported.as_ref()]).unwrap();
        let doubled = doubled
            .as_any()
            .downcast_ref::<GenericStringArray<Offset>>()
            .unwrap();

        // verify
        let expected = GenericStringArray::<Offset>::from(vec![
            Some("a"),
            None,
            Some("aaa"),
            Some("a"),
            None,
            Some("aaa"),
        ]);
        assert_eq!(doubled, &expected);

        // (drop/release)
        Ok(())
    }

    #[test]
    fn test_string() -> Result<()> {
        test_generic_string::<i32>()
    }

    #[test]
    fn test_large_string() -> Result<()> {
        test_generic_string::<i64>()
    }

    fn test_generic_binary<Offset: BinaryOffsetSizeTrait>() -> Result<()> {
        // create an array natively
        let values: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
        let native = GenericBinaryArray::<Offset>::from(values);

        // export it and import it back
        let imported = make_array(round_trip(native.data().as_ref().clone())?);

        // perform some operation
        let doubled =
            kernels::concat::concat(&[imported.as_ref(), imported.as_ref()]).unwrap();
        let doubled = doubled
            .as_any()
            .downcast_ref::<GenericBinaryArray<Offset>>()
            .unwrap();

        // verify
        let expected: Vec<Option<&[u8]>> = vec![
            Some(b"a"),
            None,
            Some(b"aaa"),
            Some(b"a"),
            None,
            Some(b"aaa"),
        ];
        let expected = GenericBinaryArray::<Offset>::from(expected);
        assert_eq!(doubled, &expected);

        // (drop/release)
        Ok(())
    }

    #[test]
    fn test_binary() -> Result<()> {
        test_generic_binary::<i32>()
    }

    #[test]
    fn test_large_binary() -> Result<()> {
        test_generic_binary::<i64>()
    }

    #[test]
    fn test_bool() -> Result<()> {
        // create an array natively
        let native = BooleanArray::from(vec![None, Some(true), Some(false)]);

        // export it and import it back
        let imported = make_array(round_trip(native.data().as_ref().clone())?);

        // perform some operation
        let imported = imported.as_any().downcast_ref::<BooleanArray>().unwrap();
        let negated = kernels::boolean::not(&imported)?;

        // verify
        assert_eq!(
            negated,
            BooleanArray::from(vec![None, Some(false), Some(true)])
        );

        // (drop/release)
        Ok(())
    }

    #[test]
    fn test_time32() -> Result<()> {
        // create an array natively
        let native = Time32MillisecondArray::from(vec![None, Some(1), Some(2)]);

        // export it and import it back
        let imported = make_array(round_trip(native.data().as_ref().clone())?);

        // perform some operation
        let doubled =
            kernels::concat::concat(&[imported.as_ref(), imported.as_ref()]).unwrap();
        let doubled = doubled
            .as_any()
            .downcast_ref::<Time32MillisecondArray>()
            .unwrap();

        // verify
        assert_eq!(
            doubled,
            &Time32MillisecondArray::from(vec![
                None,
                Some(1),
                Some(2),
                None,
                Some(1),
                Some(2)
            ])
        );

        // (drop/release)
        Ok(())
    }
}
823