1 use crate::unit::{Unit, UnitID}; 2 use std::collections::HashSet; 3 4 const BLOCK_SIZE: usize = 256; 5 const NUM_TARGET_BLOCKS: i32 = 16; // the number of target blocks to find offsets 6 const INVALID_NEXT: u8 = 0; // 0 means that there is no next unused unit 7 const INVALID_PREV: u8 = 255; // 255 means that there is no previous unused unit 8 9 /// A double-array trie builder. 10 #[derive(Debug)] 11 pub struct DoubleArrayBuilder { 12 pub blocks: Vec<DoubleArrayBlock>, 13 pub used_offsets: HashSet<u32>, 14 } 15 16 impl DoubleArrayBuilder { 17 /// Constructs a new `DoubleArrayBuilder` with an empty `DoubleArrayBlock`. new() -> Self18 pub fn new() -> Self { 19 Self { 20 blocks: vec![DoubleArrayBlock::new(0)], 21 used_offsets: HashSet::new(), 22 } 23 } 24 25 /// Builds a double-array trie with a `keyset` and returns it when build finished successfully. 26 /// Otherwise, returns `None`. 27 /// The `keyset` must be sorted. build<'a, T>(keyset: &[(T, u32)]) -> Option<Vec<u8>> where T: AsRef<[u8]>,28 pub fn build<'a, T>(keyset: &[(T, u32)]) -> Option<Vec<u8>> 29 where 30 T: AsRef<[u8]>, 31 { 32 Self::new().build_from_keyset(keyset) 33 } 34 35 /// Builds a double-array trie with a `keyset` and returns it when build finished successfully. 36 /// Otherwise, returns `None`. 37 /// The `keyset` must be sorted. build_from_keyset<T>(&mut self, keyset: &[(T, u32)]) -> Option<Vec<u8>> where T: AsRef<[u8]>,38 pub fn build_from_keyset<T>(&mut self, keyset: &[(T, u32)]) -> Option<Vec<u8>> 39 where 40 T: AsRef<[u8]>, 41 { 42 self.reserve(0); // reserve root node 43 self.build_recursive(keyset, 0, 0, keyset.len(), 0)?; 44 45 let mut da_bytes = Vec::with_capacity(self.blocks.len() * BLOCK_SIZE); 46 for block in &self.blocks { 47 for unit in block.units.iter() { 48 let bytes = unit.as_u32().to_le_bytes(); 49 da_bytes.extend_from_slice(&bytes); 50 } 51 } 52 53 Some(da_bytes) 54 } 55 56 /// Returns the number of `Unit`s that this builder contains. num_units(&self) -> u3257 pub fn num_units(&self) -> u32 { 58 (self.blocks.len() * BLOCK_SIZE) as u32 59 } 60 61 /// Returns the number of used `Unit`s that this builder contains. num_used_units(&self) -> u3262 pub fn num_used_units(&self) -> u32 { 63 self.blocks 64 .iter() 65 .map(|block| { 66 block 67 .is_used 68 .iter() 69 .fold(0, |acc, &is_used| acc + if is_used { 1 } else { 0 }) 70 }) 71 .sum::<u32>() 72 } 73 get_block(&self, unit_id: UnitID) -> Option<&DoubleArrayBlock>74 fn get_block(&self, unit_id: UnitID) -> Option<&DoubleArrayBlock> { 75 self.blocks.get(unit_id / BLOCK_SIZE) 76 } 77 get_block_mut(&mut self, unit_id: UnitID) -> Option<&mut DoubleArrayBlock>78 fn get_block_mut(&mut self, unit_id: UnitID) -> Option<&mut DoubleArrayBlock> { 79 self.blocks.get_mut(unit_id / BLOCK_SIZE) 80 } 81 extend_block(&mut self) -> &DoubleArrayBlock82 fn extend_block(&mut self) -> &DoubleArrayBlock { 83 let block_id = self.blocks.len(); 84 self.blocks.push(DoubleArrayBlock::new(block_id)); 85 self.blocks.last().unwrap() 86 } 87 extend_block_mut(&mut self) -> &mut DoubleArrayBlock88 fn extend_block_mut(&mut self) -> &mut DoubleArrayBlock { 89 let block_id = self.blocks.len(); 90 self.blocks.push(DoubleArrayBlock::new(block_id)); 91 self.blocks.last_mut().unwrap() 92 } 93 get_unit_mut(&mut self, unit_id: UnitID) -> &mut Unit94 fn get_unit_mut(&mut self, unit_id: UnitID) -> &mut Unit { 95 while self.get_block(unit_id).is_none() { 96 self.extend_block_mut(); 97 } 98 let block = self.get_block_mut(unit_id).unwrap(); 99 &mut block.units[unit_id % BLOCK_SIZE] 100 } 101 reserve(&mut self, unit_id: UnitID)102 fn reserve(&mut self, unit_id: UnitID) { 103 while self.get_block(unit_id).is_none() { 104 self.extend_block_mut(); 105 } 106 let block = self.get_block_mut(unit_id).unwrap(); 107 assert!(unit_id % BLOCK_SIZE < 256); 108 block.reserve((unit_id % BLOCK_SIZE) as u8); 109 } 110 build_recursive<T>( &mut self, keyset: &[(T, u32)], depth: usize, begin: usize, end: usize, unit_id: UnitID, ) -> Option<()> where T: AsRef<[u8]>,111 fn build_recursive<T>( 112 &mut self, 113 keyset: &[(T, u32)], 114 depth: usize, 115 begin: usize, 116 end: usize, 117 unit_id: UnitID, 118 ) -> Option<()> 119 where 120 T: AsRef<[u8]>, 121 { 122 // element of labels is a tuple (label, start_position, end_position) 123 let mut labels: Vec<(u8, usize, usize)> = Vec::with_capacity(256); 124 let mut value = None; 125 126 for i in begin..end { 127 let key_value = keyset.get(i).unwrap(); 128 let label = { 129 let key = key_value.0.as_ref(); 130 if depth == key.len() { 131 0 132 } else { 133 *key.get(depth)? 134 } 135 }; 136 if label == 0 { 137 assert!(value.is_none()); // there is just one '\0' in a key 138 value = Some(key_value.1); 139 } 140 match labels.last_mut() { 141 Some(last_label) => { 142 if last_label.0 != label { 143 last_label.2 = i; // set end position 144 labels.push((label, i, 0)); 145 } 146 } 147 None => { 148 labels.push((label, i, 0)); 149 } 150 } 151 } 152 assert!(labels.len() > 0); 153 154 let mut last_label = labels.last_mut().unwrap(); 155 last_label.2 = end; 156 157 let labels_ = labels.iter().map(|(key, _, _)| *key).collect::<Vec<_>>(); 158 assert!(labels_.len() > 0); 159 160 // search an offset where these children fits to unused positions. 161 let offset: u32 = loop { 162 let offset = self.find_offset(unit_id, &labels_); 163 if offset.is_some() { 164 break offset.unwrap(); 165 } 166 self.extend_block(); 167 }; 168 assert!( 169 offset < (1u32 << 29), 170 "offset must be represented as 29 bits integer" 171 ); 172 173 // mark the offset used 174 self.used_offsets.insert(offset); 175 176 let has_leaf = labels_.first().filter(|&&x| x == 0).is_some(); 177 178 // populate offset and has_leaf flag to parent node 179 let parent_unit = self.get_unit_mut(unit_id); 180 assert_eq!( 181 parent_unit.offset(), 182 0, 183 "offset() should return 0 before set_offset()" 184 ); 185 parent_unit.set_offset(offset ^ unit_id as u32); // store the relative offset to the index 186 assert!( 187 !parent_unit.has_leaf(), 188 "has_leaf() should return false before set_has_leaf()" 189 ); 190 parent_unit.set_has_leaf(has_leaf); 191 192 // populate label or associated value to children node 193 for label in labels_ { 194 let child_id = (offset ^ label as u32) as UnitID; 195 self.reserve(child_id); 196 197 let unit = self.get_unit_mut(child_id); 198 199 // child node units should be empty 200 assert_eq!(unit.offset(), 0); 201 assert_eq!(unit.label(), 0); 202 assert_eq!(unit.value(), 0); 203 assert!(!unit.has_leaf()); 204 205 if label == 0 { 206 assert!(value.is_some()); 207 unit.set_value(value.unwrap()); 208 } else { 209 unit.set_label(label); 210 } 211 } 212 213 // recursive call in depth-first order 214 for (label, begin, end) in labels { 215 self.build_recursive( 216 keyset, 217 depth + 1, 218 begin, 219 end, 220 (label as u32 ^ offset) as UnitID, 221 ); 222 } 223 224 Some(()) 225 } 226 find_offset(&self, unit_id: UnitID, labels: &Vec<u8>) -> Option<u32>227 fn find_offset(&self, unit_id: UnitID, labels: &Vec<u8>) -> Option<u32> { 228 let head_block = (self.blocks.len() as i32 - NUM_TARGET_BLOCKS).max(0) as usize; 229 self.blocks 230 .iter() 231 .skip(head_block) // search for offset in last N blocks 232 .find_map(|block| { 233 // find the first valid offset in a block 234 for offset in block.find_offset(unit_id, labels) { 235 let offset_u32 = (block.id as u32) << 8 | offset as u32; 236 if !self.used_offsets.contains(&offset_u32) { 237 return Some((block.id as u32) << 8 | offset as u32); 238 } 239 } 240 None 241 }) 242 } 243 } 244 245 const DEFAULT_UNITS: [Unit; BLOCK_SIZE] = [Unit::new(); BLOCK_SIZE]; 246 const DEFAULT_IS_USED: [bool; BLOCK_SIZE] = [false; BLOCK_SIZE]; 247 const DEFAULT_NEXT_UNUSED: [u8; BLOCK_SIZE] = { 248 let mut next_unused = [INVALID_NEXT; BLOCK_SIZE]; 249 let mut i = 0; 250 while i < next_unused.len() - 1 { 251 next_unused[i] = (i + 1) as u8; 252 i += 1; 253 } 254 next_unused 255 }; 256 const DEFAULT_PREV_UNUSED: [u8; BLOCK_SIZE] = { 257 let mut prev_unused = [INVALID_PREV; BLOCK_SIZE]; 258 let mut i = 1; 259 while i < prev_unused.len() { 260 prev_unused[i] = (i - 1) as u8; 261 i += 1; 262 } 263 prev_unused 264 }; 265 266 /// A block that have a shard of a double-array and other useful data structures. 267 pub struct DoubleArrayBlock { 268 pub id: usize, 269 pub units: [Unit; BLOCK_SIZE], 270 pub is_used: [bool; BLOCK_SIZE], 271 pub head_unused: u8, 272 pub next_unused: [u8; BLOCK_SIZE], 273 pub prev_unused: [u8; BLOCK_SIZE], 274 } 275 276 impl DoubleArrayBlock { new(id: usize) -> Self277 const fn new(id: usize) -> Self { 278 Self { 279 id, 280 units: DEFAULT_UNITS, 281 is_used: DEFAULT_IS_USED, 282 head_unused: 0, 283 next_unused: DEFAULT_NEXT_UNUSED, 284 prev_unused: DEFAULT_PREV_UNUSED, 285 } 286 } 287 288 /// Finds a valid offset in this block. find_offset<'a>( &'a self, unit_id: UnitID, labels: &'a Vec<u8>, ) -> impl Iterator<Item = u8> + 'a289 fn find_offset<'a>( 290 &'a self, 291 unit_id: UnitID, 292 labels: &'a Vec<u8>, 293 ) -> impl Iterator<Item = u8> + 'a { 294 assert!(labels.len() > 0); 295 FindOffset { 296 unused_id: self.head_unused, 297 block: self, 298 unit_id, 299 labels, 300 } 301 } 302 reserve(&mut self, id: u8)303 fn reserve(&mut self, id: u8) { 304 // maintain is_used 305 self.is_used[id as usize] = true; 306 307 let prev_id = self.prev_unused[id as usize]; 308 let next_id = self.next_unused[id as usize]; 309 310 // maintain next_unused 311 if prev_id != INVALID_PREV { 312 self.next_unused[prev_id as usize] = next_id; 313 } 314 self.next_unused[id as usize] = INVALID_NEXT; // this line can be removed 315 316 // maintain prev_unused 317 if next_id != INVALID_NEXT { 318 self.prev_unused[next_id as usize] = prev_id; 319 } 320 self.prev_unused[id as usize] = INVALID_PREV; // this line can be removed 321 322 // maintain head_unused 323 if id == self.head_unused { 324 self.head_unused = next_id; 325 } 326 } 327 } 328 329 pub struct FindOffset<'a> { 330 unused_id: u8, 331 block: &'a DoubleArrayBlock, 332 unit_id: UnitID, // parent node position to set the offset 333 labels: &'a Vec<u8>, 334 } 335 336 impl<'a> FindOffset<'a> { 337 #[inline] is_valid_offset(&self, offset: u8) -> bool338 fn is_valid_offset(&self, offset: u8) -> bool { 339 let offset_u32 = (self.block.id as u32) << 8 | offset as u32; 340 let relative_offset = self.unit_id as u32 ^ offset_u32; 341 if (relative_offset & (0xFF << 21)) > 0 && (relative_offset & 0xFF) > 0 { 342 return false; 343 } 344 345 self.labels.iter().skip(1).all(|label| { 346 let id = offset ^ label; 347 match self.block.is_used.get(id as UnitID) { 348 Some(is_used) => !*is_used, 349 None => { 350 // something is going wrong 351 assert!(false, "DoubleArrayBlock is_used.get({}) was fault", id); 352 false 353 } 354 } 355 }) 356 } 357 } 358 359 impl<'a> Iterator for FindOffset<'a> { 360 type Item = u8; 361 next(&mut self) -> Option<Self::Item>362 fn next(&mut self) -> Option<Self::Item> { 363 if self.unused_id == INVALID_NEXT && self.block.is_used[self.unused_id as usize] == true { 364 return None; 365 } 366 367 // return if this block is full 368 if self.block.head_unused == INVALID_NEXT && self.block.is_used[0] == true { 369 assert!(self.block.is_used.iter().all(|is_used| *is_used)); // assert full 370 return None; 371 } 372 assert!(!self.block.is_used.iter().all(|is_used| *is_used)); // assert not full 373 374 loop { 375 assert!(!self.block.is_used[self.unused_id as usize]); 376 377 let first_label = *self.labels.first()?; 378 let offset = self.unused_id ^ first_label; 379 380 let is_valid_offset = self.is_valid_offset(offset); 381 382 // update unused_id to next unused node 383 self.unused_id = self.block.next_unused[self.unused_id as usize]; 384 385 if is_valid_offset { 386 return Some(offset); 387 } 388 389 if self.unused_id == INVALID_NEXT { 390 return None; 391 } 392 } 393 } 394 } 395 396 impl std::fmt::Debug for DoubleArrayBlock { fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result397 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 398 f.debug_struct("DoubleArrayBlock") 399 .field( 400 "units", 401 &format_args!( 402 "[{}]", 403 self.units 404 .iter() 405 .map(|u| u.to_string()) 406 .collect::<Vec<_>>() 407 .join(", ") 408 ), 409 ) 410 .field( 411 "is_used", 412 &format_args!( 413 "[{}]", 414 self.is_used 415 .iter() 416 .map(|u| u.to_string()) 417 .collect::<Vec<_>>() 418 .join(", ") 419 ), 420 ) 421 .field("head_unused", &self.head_unused) 422 .field( 423 "next_unused", 424 &format_args!( 425 "[{}]", 426 self.next_unused 427 .iter() 428 .map(|u| u.to_string()) 429 .collect::<Vec<_>>() 430 .join(", ") 431 ), 432 ) 433 .field( 434 "prev_unused", 435 &format_args!( 436 "[{}]", 437 self.prev_unused 438 .iter() 439 .map(|u| u.to_string()) 440 .collect::<Vec<_>>() 441 .join(", ") 442 ), 443 ) 444 .finish() 445 } 446 } 447 448 #[cfg(test)] 449 mod tests { 450 use crate::builder::DoubleArrayBuilder; 451 452 #[test] test_build()453 fn test_build() { 454 let keyset: &[(&[u8], u32)] = &[ 455 ("a".as_bytes(), 0), 456 ("aa".as_bytes(), 0), 457 ("aaa".as_bytes(), 0), 458 ("aaaa".as_bytes(), 0), 459 ("aaaaa".as_bytes(), 0), 460 ("ab".as_bytes(), 0), 461 ("abc".as_bytes(), 0), 462 ("abcd".as_bytes(), 0), 463 ("abcde".as_bytes(), 0), 464 ("abcdef".as_bytes(), 0), 465 ]; 466 467 let mut builder = DoubleArrayBuilder::new(); 468 let da = builder.build_from_keyset(keyset); 469 assert!(da.is_some()); 470 471 assert!(0 < builder.num_units()); 472 assert!(0 < builder.num_used_units()); 473 assert!(builder.num_used_units() < builder.num_units()); 474 } 475 } 476