1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, 12 // software distributed under the License is distributed on an 13 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 // KIND, either express or implied. See the License for the 15 // specific language governing permissions and limitations 16 // under the License. 17 18 use std::convert::From; 19 use std::fmt; 20 use std::mem; 21 use std::{any::Any, iter::FromIterator}; 22 23 use super::{ 24 array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, ArrayDataRef, 25 GenericListArray, GenericStringIter, OffsetSizeTrait, 26 }; 27 use crate::buffer::Buffer; 28 use crate::util::bit_util; 29 use crate::{buffer::MutableBuffer, datatypes::DataType}; 30 31 /// Like OffsetSizeTrait, but specialized for Strings 32 // This allow us to expose a constant datatype for the GenericStringArray 33 pub trait StringOffsetSizeTrait: OffsetSizeTrait { 34 const DATA_TYPE: DataType; 35 } 36 37 impl StringOffsetSizeTrait for i32 { 38 const DATA_TYPE: DataType = DataType::Utf8; 39 } 40 41 impl StringOffsetSizeTrait for i64 { 42 const DATA_TYPE: DataType = DataType::LargeUtf8; 43 } 44 45 /// Generic struct for \[Large\]StringArray 46 pub struct GenericStringArray<OffsetSize: StringOffsetSizeTrait> { 47 data: ArrayDataRef, 48 value_offsets: RawPtrBox<OffsetSize>, 49 value_data: RawPtrBox<u8>, 50 } 51 52 impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> { 53 /// Returns the length for the element at index `i`. 54 #[inline] value_length(&self, i: usize) -> OffsetSize55 pub fn value_length(&self, i: usize) -> OffsetSize { 56 let offsets = self.value_offsets(); 57 offsets[i + 1] - offsets[i] 58 } 59 60 /// Returns the offset values in the offsets buffer 61 #[inline] value_offsets(&self) -> &[OffsetSize]62 pub fn value_offsets(&self) -> &[OffsetSize] { 63 // Soundness 64 // pointer alignment & location is ensured by RawPtrBox 65 // buffer bounds/offset is ensured by the ArrayData instance. 66 unsafe { 67 std::slice::from_raw_parts( 68 self.value_offsets.as_ptr().add(self.data.offset()), 69 self.len() + 1, 70 ) 71 } 72 } 73 74 /// Returns a clone of the value data buffer value_data(&self) -> Buffer75 pub fn value_data(&self) -> Buffer { 76 self.data.buffers()[1].clone() 77 } 78 79 /// Returns the element at index 80 /// # Safety 81 /// caller is responsible for ensuring that index is within the array bounds value_unchecked(&self, i: usize) -> &str82 pub unsafe fn value_unchecked(&self, i: usize) -> &str { 83 let end = self.value_offsets().get_unchecked(i + 1); 84 let start = self.value_offsets().get_unchecked(i); 85 86 // Soundness 87 // pointer alignment & location is ensured by RawPtrBox 88 // buffer bounds/offset is ensured by the value_offset invariants 89 // ISSUE: utf-8 well formedness is not checked 90 let slice = std::slice::from_raw_parts( 91 self.value_data.as_ptr().offset(start.to_isize()), 92 (*end - *start).to_usize().unwrap(), 93 ); 94 std::str::from_utf8_unchecked(slice) 95 } 96 97 /// Returns the element at index `i` as &str value(&self, i: usize) -> &str98 pub fn value(&self, i: usize) -> &str { 99 assert!(i < self.data.len(), "StringArray out of bounds access"); 100 //Soundness: length checked above, offset buffer length is 1 larger than logical array length 101 let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; 102 let start = unsafe { self.value_offsets().get_unchecked(i) }; 103 104 // Soundness 105 // pointer alignment & location is ensured by RawPtrBox 106 // buffer bounds/offset is ensured by the value_offset invariants 107 // ISSUE: utf-8 well formedness is not checked 108 unsafe { 109 let slice = std::slice::from_raw_parts( 110 self.value_data.as_ptr().offset(start.to_isize()), 111 (*end - *start).to_usize().unwrap(), 112 ); 113 std::str::from_utf8_unchecked(slice) 114 } 115 } 116 from_list(v: GenericListArray<OffsetSize>) -> Self117 fn from_list(v: GenericListArray<OffsetSize>) -> Self { 118 assert_eq!( 119 v.data().child_data()[0].child_data().len(), 120 0, 121 "StringArray can only be created from list array of u8 values \ 122 (i.e. List<PrimitiveArray<u8>>)." 123 ); 124 assert_eq!( 125 v.data_ref().child_data()[0].data_type(), 126 &DataType::UInt8, 127 "StringArray can only be created from List<u8> arrays, mismatched data types." 128 ); 129 130 let mut builder = ArrayData::builder(OffsetSize::DATA_TYPE) 131 .len(v.len()) 132 .add_buffer(v.data_ref().buffers()[0].clone()) 133 .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()); 134 if let Some(bitmap) = v.data().null_bitmap() { 135 builder = builder.null_bit_buffer(bitmap.bits.clone()) 136 } 137 138 let data = builder.build(); 139 Self::from(data) 140 } 141 from_vec(v: Vec<&str>) -> Self142 pub(crate) fn from_vec(v: Vec<&str>) -> Self { 143 let mut offsets = 144 MutableBuffer::new((v.len() + 1) * std::mem::size_of::<OffsetSize>()); 145 let mut values = MutableBuffer::new(0); 146 147 let mut length_so_far = OffsetSize::zero(); 148 offsets.push(length_so_far); 149 150 for s in &v { 151 length_so_far += OffsetSize::from_usize(s.len()).unwrap(); 152 offsets.push(length_so_far); 153 values.extend_from_slice(s.as_bytes()); 154 } 155 let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) 156 .len(v.len()) 157 .add_buffer(offsets.into()) 158 .add_buffer(values.into()) 159 .build(); 160 Self::from(array_data) 161 } 162 from_opt_vec(v: Vec<Option<&str>>) -> Self163 pub(crate) fn from_opt_vec(v: Vec<Option<&str>>) -> Self { 164 v.into_iter().collect() 165 } 166 167 /// Creates a `GenericStringArray` based on an iterator of values without nulls from_iter_values<Ptr, I: IntoIterator<Item = Ptr>>(iter: I) -> Self where Ptr: AsRef<str>,168 pub fn from_iter_values<Ptr, I: IntoIterator<Item = Ptr>>(iter: I) -> Self 169 where 170 Ptr: AsRef<str>, 171 { 172 let iter = iter.into_iter(); 173 let (_, data_len) = iter.size_hint(); 174 let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. 175 176 let mut offsets = 177 MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>()); 178 let mut values = MutableBuffer::new(0); 179 180 let mut length_so_far = OffsetSize::zero(); 181 offsets.push(length_so_far); 182 183 for i in iter { 184 let s = i.as_ref(); 185 length_so_far += OffsetSize::from_usize(s.len()).unwrap(); 186 offsets.push(length_so_far); 187 values.extend_from_slice(s.as_bytes()); 188 } 189 let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) 190 .len(data_len) 191 .add_buffer(offsets.into()) 192 .add_buffer(values.into()) 193 .build(); 194 Self::from(array_data) 195 } 196 } 197 198 impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator<Option<Ptr>> 199 for GenericStringArray<OffsetSize> 200 where 201 Ptr: AsRef<str>, 202 { from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self203 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self { 204 let iter = iter.into_iter(); 205 let (_, data_len) = iter.size_hint(); 206 let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. 207 208 let mut offsets = 209 MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>()); 210 let mut values = MutableBuffer::new(0); 211 let mut null_buf = MutableBuffer::new_null(data_len); 212 let null_slice = null_buf.as_slice_mut(); 213 let mut length_so_far = OffsetSize::zero(); 214 offsets.push(length_so_far); 215 216 for (i, s) in iter.enumerate() { 217 if let Some(s) = s { 218 let s = s.as_ref(); 219 // set null bit 220 bit_util::set_bit(null_slice, i); 221 222 length_so_far += OffsetSize::from_usize(s.len()).unwrap(); 223 values.extend_from_slice(s.as_bytes()); 224 } else { 225 values.extend_from_slice(b""); 226 } 227 offsets.push(length_so_far); 228 } 229 230 let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) 231 .len(data_len) 232 .add_buffer(offsets.into()) 233 .add_buffer(values.into()) 234 .null_bit_buffer(null_buf.into()) 235 .build(); 236 Self::from(array_data) 237 } 238 } 239 240 impl<'a, T: StringOffsetSizeTrait> IntoIterator for &'a GenericStringArray<T> { 241 type Item = Option<&'a str>; 242 type IntoIter = GenericStringIter<'a, T>; 243 into_iter(self) -> Self::IntoIter244 fn into_iter(self) -> Self::IntoIter { 245 GenericStringIter::<'a, T>::new(self) 246 } 247 } 248 249 impl<'a, T: StringOffsetSizeTrait> GenericStringArray<T> { 250 /// constructs a new iterator iter(&'a self) -> GenericStringIter<'a, T>251 pub fn iter(&'a self) -> GenericStringIter<'a, T> { 252 GenericStringIter::<'a, T>::new(&self) 253 } 254 } 255 256 impl<OffsetSize: StringOffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result257 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 258 write!(f, "{}StringArray\n[\n", OffsetSize::prefix())?; 259 print_long_array(self, f, |array, index, f| { 260 fmt::Debug::fmt(&array.value(index), f) 261 })?; 262 write!(f, "]") 263 } 264 } 265 266 impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize> { as_any(&self) -> &Any267 fn as_any(&self) -> &Any { 268 self 269 } 270 data(&self) -> ArrayDataRef271 fn data(&self) -> ArrayDataRef { 272 self.data.clone() 273 } 274 data_ref(&self) -> &ArrayDataRef275 fn data_ref(&self) -> &ArrayDataRef { 276 &self.data 277 } 278 279 /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name]. get_buffer_memory_size(&self) -> usize280 fn get_buffer_memory_size(&self) -> usize { 281 self.data.get_buffer_memory_size() 282 } 283 284 /// Returns the total number of bytes of memory occupied physically by this [$name]. get_array_memory_size(&self) -> usize285 fn get_array_memory_size(&self) -> usize { 286 self.data.get_array_memory_size() + mem::size_of_val(self) 287 } 288 } 289 290 impl<OffsetSize: StringOffsetSizeTrait> From<ArrayDataRef> 291 for GenericStringArray<OffsetSize> 292 { from(data: ArrayDataRef) -> Self293 fn from(data: ArrayDataRef) -> Self { 294 assert_eq!( 295 data.data_type(), 296 &<OffsetSize as StringOffsetSizeTrait>::DATA_TYPE, 297 "[Large]StringArray expects Datatype::[Large]Utf8" 298 ); 299 assert_eq!( 300 data.buffers().len(), 301 2, 302 "StringArray data should contain 2 buffers only (offsets and values)" 303 ); 304 let offsets = data.buffers()[0].as_ptr(); 305 let values = data.buffers()[1].as_ptr(); 306 Self { 307 data, 308 value_offsets: unsafe { RawPtrBox::new(offsets) }, 309 value_data: unsafe { RawPtrBox::new(values) }, 310 } 311 } 312 } 313 314 impl<OffsetSize: StringOffsetSizeTrait> From<Vec<Option<&str>>> 315 for GenericStringArray<OffsetSize> 316 { from(v: Vec<Option<&str>>) -> Self317 fn from(v: Vec<Option<&str>>) -> Self { 318 GenericStringArray::<OffsetSize>::from_opt_vec(v) 319 } 320 } 321 322 impl<OffsetSize: StringOffsetSizeTrait> From<Vec<&str>> 323 for GenericStringArray<OffsetSize> 324 { from(v: Vec<&str>) -> Self325 fn from(v: Vec<&str>) -> Self { 326 GenericStringArray::<OffsetSize>::from_vec(v) 327 } 328 } 329 330 /// An array where each element is a variable-sized sequence of bytes representing a string 331 /// whose maximum length (in bytes) is represented by a i32. 332 pub type StringArray = GenericStringArray<i32>; 333 334 /// An array where each element is a variable-sized sequence of bytes representing a string 335 /// whose maximum length (in bytes) is represented by a i64. 336 pub type LargeStringArray = GenericStringArray<i64>; 337 338 impl<T: StringOffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<T> { from(v: GenericListArray<T>) -> Self339 fn from(v: GenericListArray<T>) -> Self { 340 GenericStringArray::<T>::from_list(v) 341 } 342 } 343 344 #[cfg(test)] 345 mod tests { 346 use crate::array::{ListBuilder, StringBuilder}; 347 348 use super::*; 349 350 #[test] test_string_array_from_u8_slice()351 fn test_string_array_from_u8_slice() { 352 let values: Vec<&str> = vec!["hello", "", "parquet"]; 353 354 // Array data: ["hello", "", "parquet"] 355 let string_array = StringArray::from(values); 356 357 assert_eq!(3, string_array.len()); 358 assert_eq!(0, string_array.null_count()); 359 assert_eq!("hello", string_array.value(0)); 360 assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); 361 assert_eq!("", string_array.value(1)); 362 assert_eq!("", unsafe { string_array.value_unchecked(1) }); 363 assert_eq!("parquet", string_array.value(2)); 364 assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); 365 assert_eq!(5, string_array.value_offsets()[2]); 366 assert_eq!(7, string_array.value_length(2)); 367 for i in 0..3 { 368 assert!(string_array.is_valid(i)); 369 assert!(!string_array.is_null(i)); 370 } 371 } 372 373 #[test] 374 #[should_panic(expected = "[Large]StringArray expects Datatype::[Large]Utf8")] test_string_array_from_int()375 fn test_string_array_from_int() { 376 let array = LargeStringArray::from(vec!["a", "b"]); 377 StringArray::from(array.data()); 378 } 379 380 #[test] test_large_string_array_from_u8_slice()381 fn test_large_string_array_from_u8_slice() { 382 let values: Vec<&str> = vec!["hello", "", "parquet"]; 383 384 // Array data: ["hello", "", "parquet"] 385 let string_array = LargeStringArray::from(values); 386 387 assert_eq!(3, string_array.len()); 388 assert_eq!(0, string_array.null_count()); 389 assert_eq!("hello", string_array.value(0)); 390 assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); 391 assert_eq!("", string_array.value(1)); 392 assert_eq!("", unsafe { string_array.value_unchecked(1) }); 393 assert_eq!("parquet", string_array.value(2)); 394 assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); 395 assert_eq!(5, string_array.value_offsets()[2]); 396 assert_eq!(7, string_array.value_length(2)); 397 for i in 0..3 { 398 assert!(string_array.is_valid(i)); 399 assert!(!string_array.is_null(i)); 400 } 401 } 402 403 #[test] test_nested_string_array()404 fn test_nested_string_array() { 405 let string_builder = StringBuilder::new(3); 406 let mut list_of_string_builder = ListBuilder::new(string_builder); 407 408 list_of_string_builder.values().append_value("foo").unwrap(); 409 list_of_string_builder.values().append_value("bar").unwrap(); 410 list_of_string_builder.append(true).unwrap(); 411 412 list_of_string_builder 413 .values() 414 .append_value("foobar") 415 .unwrap(); 416 list_of_string_builder.append(true).unwrap(); 417 let list_of_strings = list_of_string_builder.finish(); 418 419 assert_eq!(list_of_strings.len(), 2); 420 421 let first_slot = list_of_strings.value(0); 422 let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap(); 423 assert_eq!(first_list.len(), 2); 424 assert_eq!(first_list.value(0), "foo"); 425 assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo"); 426 assert_eq!(first_list.value(1), "bar"); 427 assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar"); 428 429 let second_slot = list_of_strings.value(1); 430 let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap(); 431 assert_eq!(second_list.len(), 1); 432 assert_eq!(second_list.value(0), "foobar"); 433 assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar"); 434 } 435 436 #[test] 437 #[should_panic(expected = "StringArray out of bounds access")] test_string_array_get_value_index_out_of_bound()438 fn test_string_array_get_value_index_out_of_bound() { 439 let values: [u8; 12] = [ 440 b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', 441 ]; 442 let offsets: [i32; 4] = [0, 5, 5, 12]; 443 let array_data = ArrayData::builder(DataType::Utf8) 444 .len(3) 445 .add_buffer(Buffer::from_slice_ref(&offsets)) 446 .add_buffer(Buffer::from_slice_ref(&values)) 447 .build(); 448 let string_array = StringArray::from(array_data); 449 string_array.value(4); 450 } 451 452 #[test] test_string_array_fmt_debug()453 fn test_string_array_fmt_debug() { 454 let arr: StringArray = vec!["hello", "arrow"].into(); 455 assert_eq!( 456 "StringArray\n[\n \"hello\",\n \"arrow\",\n]", 457 format!("{:?}", arr) 458 ); 459 } 460 461 #[test] test_large_string_array_fmt_debug()462 fn test_large_string_array_fmt_debug() { 463 let arr: LargeStringArray = vec!["hello", "arrow"].into(); 464 assert_eq!( 465 "LargeStringArray\n[\n \"hello\",\n \"arrow\",\n]", 466 format!("{:?}", arr) 467 ); 468 } 469 470 #[test] test_string_array_from_iter()471 fn test_string_array_from_iter() { 472 let data = vec![Some("hello"), None, Some("arrow")]; 473 // from Vec<Option<&str>> 474 let array1 = StringArray::from(data.clone()); 475 // from Iterator<Option<&str>> 476 let array2: StringArray = data.clone().into_iter().collect(); 477 // from Iterator<Option<String>> 478 let array3: StringArray = 479 data.into_iter().map(|x| x.map(|s| s.to_string())).collect(); 480 481 assert_eq!(array1, array2); 482 assert_eq!(array2, array3); 483 } 484 485 #[test] test_string_array_from_iter_values()486 fn test_string_array_from_iter_values() { 487 let data = vec!["hello", "hello2"]; 488 let array1 = StringArray::from_iter_values(data.iter()); 489 490 assert_eq!(array1.value(0), "hello"); 491 assert_eq!(array1.value(1), "hello2"); 492 } 493 } 494