1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements.  See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership.  The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License.  You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing,
12 // software distributed under the License is distributed on an
13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, either express or implied.  See the License for the
15 // specific language governing permissions and limitations
16 // under the License.
17 
18 use std::convert::From;
19 use std::fmt;
20 use std::mem;
21 use std::{any::Any, iter::FromIterator};
22 
23 use super::{
24     array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, ArrayDataRef,
25     GenericListArray, GenericStringIter, OffsetSizeTrait,
26 };
27 use crate::buffer::Buffer;
28 use crate::util::bit_util;
29 use crate::{buffer::MutableBuffer, datatypes::DataType};
30 
/// Like `OffsetSizeTrait`, but specialized for strings.
///
/// Each implementing offset type names, through [`StringOffsetSizeTrait::DATA_TYPE`],
/// the `DataType` of the string array that uses it.
// This allows us to expose a constant datatype for the GenericStringArray
pub trait StringOffsetSizeTrait: OffsetSizeTrait {
    /// The `DataType` of a string array whose offsets are stored as `Self`.
    const DATA_TYPE: DataType;
}
36 
/// `i32` offsets correspond to `DataType::Utf8`.
impl StringOffsetSizeTrait for i32 {
    const DATA_TYPE: DataType = DataType::Utf8;
}
40 
/// `i64` offsets correspond to `DataType::LargeUtf8`.
impl StringOffsetSizeTrait for i64 {
    const DATA_TYPE: DataType = DataType::LargeUtf8;
}
44 
/// Generic struct for \[Large\]StringArray
///
/// `OffsetSize` selects the integer type of the offsets (`i32` or `i64`) and,
/// through `StringOffsetSizeTrait::DATA_TYPE`, the array's `DataType`.
pub struct GenericStringArray<OffsetSize: StringOffsetSizeTrait> {
    /// Shared array data; the two raw pointers below point into its buffers.
    data: ArrayDataRef,
    /// Pointer to the start of the offsets buffer (`data.buffers()[0]`).
    value_offsets: RawPtrBox<OffsetSize>,
    /// Pointer to the start of the value bytes buffer (`data.buffers()[1]`).
    value_data: RawPtrBox<u8>,
}
51 
52 impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
53     /// Returns the length for the element at index `i`.
54     #[inline]
value_length(&self, i: usize) -> OffsetSize55     pub fn value_length(&self, i: usize) -> OffsetSize {
56         let offsets = self.value_offsets();
57         offsets[i + 1] - offsets[i]
58     }
59 
60     /// Returns the offset values in the offsets buffer
61     #[inline]
value_offsets(&self) -> &[OffsetSize]62     pub fn value_offsets(&self) -> &[OffsetSize] {
63         // Soundness
64         //     pointer alignment & location is ensured by RawPtrBox
65         //     buffer bounds/offset is ensured by the ArrayData instance.
66         unsafe {
67             std::slice::from_raw_parts(
68                 self.value_offsets.as_ptr().add(self.data.offset()),
69                 self.len() + 1,
70             )
71         }
72     }
73 
74     /// Returns a clone of the value data buffer
value_data(&self) -> Buffer75     pub fn value_data(&self) -> Buffer {
76         self.data.buffers()[1].clone()
77     }
78 
79     /// Returns the element at index
80     /// # Safety
81     /// caller is responsible for ensuring that index is within the array bounds
value_unchecked(&self, i: usize) -> &str82     pub unsafe fn value_unchecked(&self, i: usize) -> &str {
83         let end = self.value_offsets().get_unchecked(i + 1);
84         let start = self.value_offsets().get_unchecked(i);
85 
86         // Soundness
87         // pointer alignment & location is ensured by RawPtrBox
88         // buffer bounds/offset is ensured by the value_offset invariants
89         // ISSUE: utf-8 well formedness is not checked
90         let slice = std::slice::from_raw_parts(
91             self.value_data.as_ptr().offset(start.to_isize()),
92             (*end - *start).to_usize().unwrap(),
93         );
94         std::str::from_utf8_unchecked(slice)
95     }
96 
97     /// Returns the element at index `i` as &str
value(&self, i: usize) -> &str98     pub fn value(&self, i: usize) -> &str {
99         assert!(i < self.data.len(), "StringArray out of bounds access");
100         //Soundness: length checked above, offset buffer length is 1 larger than logical array length
101         let end = unsafe { self.value_offsets().get_unchecked(i + 1) };
102         let start = unsafe { self.value_offsets().get_unchecked(i) };
103 
104         // Soundness
105         // pointer alignment & location is ensured by RawPtrBox
106         // buffer bounds/offset is ensured by the value_offset invariants
107         // ISSUE: utf-8 well formedness is not checked
108         unsafe {
109             let slice = std::slice::from_raw_parts(
110                 self.value_data.as_ptr().offset(start.to_isize()),
111                 (*end - *start).to_usize().unwrap(),
112             );
113             std::str::from_utf8_unchecked(slice)
114         }
115     }
116 
from_list(v: GenericListArray<OffsetSize>) -> Self117     fn from_list(v: GenericListArray<OffsetSize>) -> Self {
118         assert_eq!(
119             v.data().child_data()[0].child_data().len(),
120             0,
121             "StringArray can only be created from list array of u8 values \
122              (i.e. List<PrimitiveArray<u8>>)."
123         );
124         assert_eq!(
125             v.data_ref().child_data()[0].data_type(),
126             &DataType::UInt8,
127             "StringArray can only be created from List<u8> arrays, mismatched data types."
128         );
129 
130         let mut builder = ArrayData::builder(OffsetSize::DATA_TYPE)
131             .len(v.len())
132             .add_buffer(v.data_ref().buffers()[0].clone())
133             .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone());
134         if let Some(bitmap) = v.data().null_bitmap() {
135             builder = builder.null_bit_buffer(bitmap.bits.clone())
136         }
137 
138         let data = builder.build();
139         Self::from(data)
140     }
141 
from_vec(v: Vec<&str>) -> Self142     pub(crate) fn from_vec(v: Vec<&str>) -> Self {
143         let mut offsets =
144             MutableBuffer::new((v.len() + 1) * std::mem::size_of::<OffsetSize>());
145         let mut values = MutableBuffer::new(0);
146 
147         let mut length_so_far = OffsetSize::zero();
148         offsets.push(length_so_far);
149 
150         for s in &v {
151             length_so_far += OffsetSize::from_usize(s.len()).unwrap();
152             offsets.push(length_so_far);
153             values.extend_from_slice(s.as_bytes());
154         }
155         let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
156             .len(v.len())
157             .add_buffer(offsets.into())
158             .add_buffer(values.into())
159             .build();
160         Self::from(array_data)
161     }
162 
from_opt_vec(v: Vec<Option<&str>>) -> Self163     pub(crate) fn from_opt_vec(v: Vec<Option<&str>>) -> Self {
164         v.into_iter().collect()
165     }
166 
167     /// Creates a `GenericStringArray` based on an iterator of values without nulls
from_iter_values<Ptr, I: IntoIterator<Item = Ptr>>(iter: I) -> Self where Ptr: AsRef<str>,168     pub fn from_iter_values<Ptr, I: IntoIterator<Item = Ptr>>(iter: I) -> Self
169     where
170         Ptr: AsRef<str>,
171     {
172         let iter = iter.into_iter();
173         let (_, data_len) = iter.size_hint();
174         let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
175 
176         let mut offsets =
177             MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>());
178         let mut values = MutableBuffer::new(0);
179 
180         let mut length_so_far = OffsetSize::zero();
181         offsets.push(length_so_far);
182 
183         for i in iter {
184             let s = i.as_ref();
185             length_so_far += OffsetSize::from_usize(s.len()).unwrap();
186             offsets.push(length_so_far);
187             values.extend_from_slice(s.as_bytes());
188         }
189         let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
190             .len(data_len)
191             .add_buffer(offsets.into())
192             .add_buffer(values.into())
193             .build();
194         Self::from(array_data)
195     }
196 }
197 
198 impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator<Option<Ptr>>
199     for GenericStringArray<OffsetSize>
200 where
201     Ptr: AsRef<str>,
202 {
from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self203     fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
204         let iter = iter.into_iter();
205         let (_, data_len) = iter.size_hint();
206         let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
207 
208         let mut offsets =
209             MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>());
210         let mut values = MutableBuffer::new(0);
211         let mut null_buf = MutableBuffer::new_null(data_len);
212         let null_slice = null_buf.as_slice_mut();
213         let mut length_so_far = OffsetSize::zero();
214         offsets.push(length_so_far);
215 
216         for (i, s) in iter.enumerate() {
217             if let Some(s) = s {
218                 let s = s.as_ref();
219                 // set null bit
220                 bit_util::set_bit(null_slice, i);
221 
222                 length_so_far += OffsetSize::from_usize(s.len()).unwrap();
223                 values.extend_from_slice(s.as_bytes());
224             } else {
225                 values.extend_from_slice(b"");
226             }
227             offsets.push(length_so_far);
228         }
229 
230         let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
231             .len(data_len)
232             .add_buffer(offsets.into())
233             .add_buffer(values.into())
234             .null_bit_buffer(null_buf.into())
235             .build();
236         Self::from(array_data)
237     }
238 }
239 
240 impl<'a, T: StringOffsetSizeTrait> IntoIterator for &'a GenericStringArray<T> {
241     type Item = Option<&'a str>;
242     type IntoIter = GenericStringIter<'a, T>;
243 
into_iter(self) -> Self::IntoIter244     fn into_iter(self) -> Self::IntoIter {
245         GenericStringIter::<'a, T>::new(self)
246     }
247 }
248 
249 impl<'a, T: StringOffsetSizeTrait> GenericStringArray<T> {
250     /// constructs a new iterator
iter(&'a self) -> GenericStringIter<'a, T>251     pub fn iter(&'a self) -> GenericStringIter<'a, T> {
252         GenericStringIter::<'a, T>::new(&self)
253     }
254 }
255 
256 impl<OffsetSize: StringOffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result257     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
258         write!(f, "{}StringArray\n[\n", OffsetSize::prefix())?;
259         print_long_array(self, f, |array, index, f| {
260             fmt::Debug::fmt(&array.value(index), f)
261         })?;
262         write!(f, "]")
263     }
264 }
265 
266 impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize> {
as_any(&self) -> &Any267     fn as_any(&self) -> &Any {
268         self
269     }
270 
data(&self) -> ArrayDataRef271     fn data(&self) -> ArrayDataRef {
272         self.data.clone()
273     }
274 
data_ref(&self) -> &ArrayDataRef275     fn data_ref(&self) -> &ArrayDataRef {
276         &self.data
277     }
278 
279     /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
get_buffer_memory_size(&self) -> usize280     fn get_buffer_memory_size(&self) -> usize {
281         self.data.get_buffer_memory_size()
282     }
283 
284     /// Returns the total number of bytes of memory occupied physically by this [$name].
get_array_memory_size(&self) -> usize285     fn get_array_memory_size(&self) -> usize {
286         self.data.get_array_memory_size() + mem::size_of_val(self)
287     }
288 }
289 
290 impl<OffsetSize: StringOffsetSizeTrait> From<ArrayDataRef>
291     for GenericStringArray<OffsetSize>
292 {
from(data: ArrayDataRef) -> Self293     fn from(data: ArrayDataRef) -> Self {
294         assert_eq!(
295             data.data_type(),
296             &<OffsetSize as StringOffsetSizeTrait>::DATA_TYPE,
297             "[Large]StringArray expects Datatype::[Large]Utf8"
298         );
299         assert_eq!(
300             data.buffers().len(),
301             2,
302             "StringArray data should contain 2 buffers only (offsets and values)"
303         );
304         let offsets = data.buffers()[0].as_ptr();
305         let values = data.buffers()[1].as_ptr();
306         Self {
307             data,
308             value_offsets: unsafe { RawPtrBox::new(offsets) },
309             value_data: unsafe { RawPtrBox::new(values) },
310         }
311     }
312 }
313 
314 impl<OffsetSize: StringOffsetSizeTrait> From<Vec<Option<&str>>>
315     for GenericStringArray<OffsetSize>
316 {
from(v: Vec<Option<&str>>) -> Self317     fn from(v: Vec<Option<&str>>) -> Self {
318         GenericStringArray::<OffsetSize>::from_opt_vec(v)
319     }
320 }
321 
322 impl<OffsetSize: StringOffsetSizeTrait> From<Vec<&str>>
323     for GenericStringArray<OffsetSize>
324 {
from(v: Vec<&str>) -> Self325     fn from(v: Vec<&str>) -> Self {
326         GenericStringArray::<OffsetSize>::from_vec(v)
327     }
328 }
329 
/// An array where each element is a variable-sized sequence of bytes representing a string
/// whose maximum length (in bytes) is represented by an `i32`.
pub type StringArray = GenericStringArray<i32>;
333 
/// An array where each element is a variable-sized sequence of bytes representing a string
/// whose maximum length (in bytes) is represented by an `i64`.
pub type LargeStringArray = GenericStringArray<i64>;
337 
338 impl<T: StringOffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<T> {
from(v: GenericListArray<T>) -> Self339     fn from(v: GenericListArray<T>) -> Self {
340         GenericStringArray::<T>::from_list(v)
341     }
342 }
343 
344 #[cfg(test)]
345 mod tests {
346     use crate::array::{ListBuilder, StringBuilder};
347 
348     use super::*;
349 
350     #[test]
test_string_array_from_u8_slice()351     fn test_string_array_from_u8_slice() {
352         let values: Vec<&str> = vec!["hello", "", "parquet"];
353 
354         // Array data: ["hello", "", "parquet"]
355         let string_array = StringArray::from(values);
356 
357         assert_eq!(3, string_array.len());
358         assert_eq!(0, string_array.null_count());
359         assert_eq!("hello", string_array.value(0));
360         assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
361         assert_eq!("", string_array.value(1));
362         assert_eq!("", unsafe { string_array.value_unchecked(1) });
363         assert_eq!("parquet", string_array.value(2));
364         assert_eq!("parquet", unsafe { string_array.value_unchecked(2) });
365         assert_eq!(5, string_array.value_offsets()[2]);
366         assert_eq!(7, string_array.value_length(2));
367         for i in 0..3 {
368             assert!(string_array.is_valid(i));
369             assert!(!string_array.is_null(i));
370         }
371     }
372 
373     #[test]
374     #[should_panic(expected = "[Large]StringArray expects Datatype::[Large]Utf8")]
test_string_array_from_int()375     fn test_string_array_from_int() {
376         let array = LargeStringArray::from(vec!["a", "b"]);
377         StringArray::from(array.data());
378     }
379 
380     #[test]
test_large_string_array_from_u8_slice()381     fn test_large_string_array_from_u8_slice() {
382         let values: Vec<&str> = vec!["hello", "", "parquet"];
383 
384         // Array data: ["hello", "", "parquet"]
385         let string_array = LargeStringArray::from(values);
386 
387         assert_eq!(3, string_array.len());
388         assert_eq!(0, string_array.null_count());
389         assert_eq!("hello", string_array.value(0));
390         assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
391         assert_eq!("", string_array.value(1));
392         assert_eq!("", unsafe { string_array.value_unchecked(1) });
393         assert_eq!("parquet", string_array.value(2));
394         assert_eq!("parquet", unsafe { string_array.value_unchecked(2) });
395         assert_eq!(5, string_array.value_offsets()[2]);
396         assert_eq!(7, string_array.value_length(2));
397         for i in 0..3 {
398             assert!(string_array.is_valid(i));
399             assert!(!string_array.is_null(i));
400         }
401     }
402 
403     #[test]
test_nested_string_array()404     fn test_nested_string_array() {
405         let string_builder = StringBuilder::new(3);
406         let mut list_of_string_builder = ListBuilder::new(string_builder);
407 
408         list_of_string_builder.values().append_value("foo").unwrap();
409         list_of_string_builder.values().append_value("bar").unwrap();
410         list_of_string_builder.append(true).unwrap();
411 
412         list_of_string_builder
413             .values()
414             .append_value("foobar")
415             .unwrap();
416         list_of_string_builder.append(true).unwrap();
417         let list_of_strings = list_of_string_builder.finish();
418 
419         assert_eq!(list_of_strings.len(), 2);
420 
421         let first_slot = list_of_strings.value(0);
422         let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
423         assert_eq!(first_list.len(), 2);
424         assert_eq!(first_list.value(0), "foo");
425         assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
426         assert_eq!(first_list.value(1), "bar");
427         assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
428 
429         let second_slot = list_of_strings.value(1);
430         let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
431         assert_eq!(second_list.len(), 1);
432         assert_eq!(second_list.value(0), "foobar");
433         assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
434     }
435 
436     #[test]
437     #[should_panic(expected = "StringArray out of bounds access")]
test_string_array_get_value_index_out_of_bound()438     fn test_string_array_get_value_index_out_of_bound() {
439         let values: [u8; 12] = [
440             b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
441         ];
442         let offsets: [i32; 4] = [0, 5, 5, 12];
443         let array_data = ArrayData::builder(DataType::Utf8)
444             .len(3)
445             .add_buffer(Buffer::from_slice_ref(&offsets))
446             .add_buffer(Buffer::from_slice_ref(&values))
447             .build();
448         let string_array = StringArray::from(array_data);
449         string_array.value(4);
450     }
451 
452     #[test]
test_string_array_fmt_debug()453     fn test_string_array_fmt_debug() {
454         let arr: StringArray = vec!["hello", "arrow"].into();
455         assert_eq!(
456             "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
457             format!("{:?}", arr)
458         );
459     }
460 
461     #[test]
test_large_string_array_fmt_debug()462     fn test_large_string_array_fmt_debug() {
463         let arr: LargeStringArray = vec!["hello", "arrow"].into();
464         assert_eq!(
465             "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
466             format!("{:?}", arr)
467         );
468     }
469 
470     #[test]
test_string_array_from_iter()471     fn test_string_array_from_iter() {
472         let data = vec![Some("hello"), None, Some("arrow")];
473         // from Vec<Option<&str>>
474         let array1 = StringArray::from(data.clone());
475         // from Iterator<Option<&str>>
476         let array2: StringArray = data.clone().into_iter().collect();
477         // from Iterator<Option<String>>
478         let array3: StringArray =
479             data.into_iter().map(|x| x.map(|s| s.to_string())).collect();
480 
481         assert_eq!(array1, array2);
482         assert_eq!(array2, array3);
483     }
484 
485     #[test]
test_string_array_from_iter_values()486     fn test_string_array_from_iter_values() {
487         let data = vec!["hello", "hello2"];
488         let array1 = StringArray::from_iter_values(data.iter());
489 
490         assert_eq!(array1.value(0), "hello");
491         assert_eq!(array1.value(1), "hello2");
492     }
493 }
494