1 use crate::unit::{Unit, UnitID};
2 use std::collections::HashSet;
3 
4 const BLOCK_SIZE: usize = 256;
5 const NUM_TARGET_BLOCKS: i32 = 16; // the number of target blocks to find offsets
6 const INVALID_NEXT: u8 = 0; // 0 means that there is no next unused unit
7 const INVALID_PREV: u8 = 255; // 255 means that there is no previous unused unit
8 
9 /// A double-array trie builder.
10 #[derive(Debug)]
11 pub struct DoubleArrayBuilder {
12     pub blocks: Vec<DoubleArrayBlock>,
13     pub used_offsets: HashSet<u32>,
14 }
15 
16 impl DoubleArrayBuilder {
17     /// Constructs a new `DoubleArrayBuilder` with an empty `DoubleArrayBlock`.
new() -> Self18     pub fn new() -> Self {
19         Self {
20             blocks: vec![DoubleArrayBlock::new(0)],
21             used_offsets: HashSet::new(),
22         }
23     }
24 
25     /// Builds a double-array trie with a `keyset` and returns it when build finished successfully.
26     /// Otherwise, returns `None`.
27     /// The `keyset` must be sorted.
build<'a, T>(keyset: &[(T, u32)]) -> Option<Vec<u8>> where T: AsRef<[u8]>,28     pub fn build<'a, T>(keyset: &[(T, u32)]) -> Option<Vec<u8>>
29     where
30         T: AsRef<[u8]>,
31     {
32         Self::new().build_from_keyset(keyset)
33     }
34 
35     /// Builds a double-array trie with a `keyset` and returns it when build finished successfully.
36     /// Otherwise, returns `None`.
37     /// The `keyset` must be sorted.
build_from_keyset<T>(&mut self, keyset: &[(T, u32)]) -> Option<Vec<u8>> where T: AsRef<[u8]>,38     pub fn build_from_keyset<T>(&mut self, keyset: &[(T, u32)]) -> Option<Vec<u8>>
39     where
40         T: AsRef<[u8]>,
41     {
42         self.reserve(0); // reserve root node
43         self.build_recursive(keyset, 0, 0, keyset.len(), 0)?;
44 
45         let mut da_bytes = Vec::with_capacity(self.blocks.len() * BLOCK_SIZE);
46         for block in &self.blocks {
47             for unit in block.units.iter() {
48                 let bytes = unit.as_u32().to_le_bytes();
49                 da_bytes.extend_from_slice(&bytes);
50             }
51         }
52 
53         Some(da_bytes)
54     }
55 
56     /// Returns the number of `Unit`s that this builder contains.
num_units(&self) -> u3257     pub fn num_units(&self) -> u32 {
58         (self.blocks.len() * BLOCK_SIZE) as u32
59     }
60 
61     /// Returns the number of used `Unit`s that this builder contains.
num_used_units(&self) -> u3262     pub fn num_used_units(&self) -> u32 {
63         self.blocks
64             .iter()
65             .map(|block| {
66                 block
67                     .is_used
68                     .iter()
69                     .fold(0, |acc, &is_used| acc + if is_used { 1 } else { 0 })
70             })
71             .sum::<u32>()
72     }
73 
get_block(&self, unit_id: UnitID) -> Option<&DoubleArrayBlock>74     fn get_block(&self, unit_id: UnitID) -> Option<&DoubleArrayBlock> {
75         self.blocks.get(unit_id / BLOCK_SIZE)
76     }
77 
get_block_mut(&mut self, unit_id: UnitID) -> Option<&mut DoubleArrayBlock>78     fn get_block_mut(&mut self, unit_id: UnitID) -> Option<&mut DoubleArrayBlock> {
79         self.blocks.get_mut(unit_id / BLOCK_SIZE)
80     }
81 
extend_block(&mut self) -> &DoubleArrayBlock82     fn extend_block(&mut self) -> &DoubleArrayBlock {
83         let block_id = self.blocks.len();
84         self.blocks.push(DoubleArrayBlock::new(block_id));
85         self.blocks.last().unwrap()
86     }
87 
extend_block_mut(&mut self) -> &mut DoubleArrayBlock88     fn extend_block_mut(&mut self) -> &mut DoubleArrayBlock {
89         let block_id = self.blocks.len();
90         self.blocks.push(DoubleArrayBlock::new(block_id));
91         self.blocks.last_mut().unwrap()
92     }
93 
get_unit_mut(&mut self, unit_id: UnitID) -> &mut Unit94     fn get_unit_mut(&mut self, unit_id: UnitID) -> &mut Unit {
95         while self.get_block(unit_id).is_none() {
96             self.extend_block_mut();
97         }
98         let block = self.get_block_mut(unit_id).unwrap();
99         &mut block.units[unit_id % BLOCK_SIZE]
100     }
101 
reserve(&mut self, unit_id: UnitID)102     fn reserve(&mut self, unit_id: UnitID) {
103         while self.get_block(unit_id).is_none() {
104             self.extend_block_mut();
105         }
106         let block = self.get_block_mut(unit_id).unwrap();
107         assert!(unit_id % BLOCK_SIZE < 256);
108         block.reserve((unit_id % BLOCK_SIZE) as u8);
109     }
110 
build_recursive<T>( &mut self, keyset: &[(T, u32)], depth: usize, begin: usize, end: usize, unit_id: UnitID, ) -> Option<()> where T: AsRef<[u8]>,111     fn build_recursive<T>(
112         &mut self,
113         keyset: &[(T, u32)],
114         depth: usize,
115         begin: usize,
116         end: usize,
117         unit_id: UnitID,
118     ) -> Option<()>
119     where
120         T: AsRef<[u8]>,
121     {
122         // element of labels is a tuple (label, start_position, end_position)
123         let mut labels: Vec<(u8, usize, usize)> = Vec::with_capacity(256);
124         let mut value = None;
125 
126         for i in begin..end {
127             let key_value = keyset.get(i).unwrap();
128             let label = {
129                 let key = key_value.0.as_ref();
130                 if depth == key.len() {
131                     0
132                 } else {
133                     *key.get(depth)?
134                 }
135             };
136             if label == 0 {
137                 assert!(value.is_none()); // there is just one '\0' in a key
138                 value = Some(key_value.1);
139             }
140             match labels.last_mut() {
141                 Some(last_label) => {
142                     if last_label.0 != label {
143                         last_label.2 = i; // set end position
144                         labels.push((label, i, 0));
145                     }
146                 }
147                 None => {
148                     labels.push((label, i, 0));
149                 }
150             }
151         }
152         assert!(labels.len() > 0);
153 
154         let mut last_label = labels.last_mut().unwrap();
155         last_label.2 = end;
156 
157         let labels_ = labels.iter().map(|(key, _, _)| *key).collect::<Vec<_>>();
158         assert!(labels_.len() > 0);
159 
160         // search an offset where these children fits to unused positions.
161         let offset: u32 = loop {
162             let offset = self.find_offset(unit_id, &labels_);
163             if offset.is_some() {
164                 break offset.unwrap();
165             }
166             self.extend_block();
167         };
168         assert!(
169             offset < (1u32 << 29),
170             "offset must be represented as 29 bits integer"
171         );
172 
173         // mark the offset used
174         self.used_offsets.insert(offset);
175 
176         let has_leaf = labels_.first().filter(|&&x| x == 0).is_some();
177 
178         // populate offset and has_leaf flag to parent node
179         let parent_unit = self.get_unit_mut(unit_id);
180         assert_eq!(
181             parent_unit.offset(),
182             0,
183             "offset() should return 0 before set_offset()"
184         );
185         parent_unit.set_offset(offset ^ unit_id as u32); // store the relative offset to the index
186         assert!(
187             !parent_unit.has_leaf(),
188             "has_leaf() should return false before set_has_leaf()"
189         );
190         parent_unit.set_has_leaf(has_leaf);
191 
192         // populate label or associated value to children node
193         for label in labels_ {
194             let child_id = (offset ^ label as u32) as UnitID;
195             self.reserve(child_id);
196 
197             let unit = self.get_unit_mut(child_id);
198 
199             // child node units should be empty
200             assert_eq!(unit.offset(), 0);
201             assert_eq!(unit.label(), 0);
202             assert_eq!(unit.value(), 0);
203             assert!(!unit.has_leaf());
204 
205             if label == 0 {
206                 assert!(value.is_some());
207                 unit.set_value(value.unwrap());
208             } else {
209                 unit.set_label(label);
210             }
211         }
212 
213         // recursive call in depth-first order
214         for (label, begin, end) in labels {
215             self.build_recursive(
216                 keyset,
217                 depth + 1,
218                 begin,
219                 end,
220                 (label as u32 ^ offset) as UnitID,
221             );
222         }
223 
224         Some(())
225     }
226 
find_offset(&self, unit_id: UnitID, labels: &Vec<u8>) -> Option<u32>227     fn find_offset(&self, unit_id: UnitID, labels: &Vec<u8>) -> Option<u32> {
228         let head_block = (self.blocks.len() as i32 - NUM_TARGET_BLOCKS).max(0) as usize;
229         self.blocks
230             .iter()
231             .skip(head_block) // search for offset in last N blocks
232             .find_map(|block| {
233                 // find the first valid offset in a block
234                 for offset in block.find_offset(unit_id, labels) {
235                     let offset_u32 = (block.id as u32) << 8 | offset as u32;
236                     if !self.used_offsets.contains(&offset_u32) {
237                         return Some((block.id as u32) << 8 | offset as u32);
238                     }
239                 }
240                 None
241             })
242     }
243 }
244 
245 const DEFAULT_UNITS: [Unit; BLOCK_SIZE] = [Unit::new(); BLOCK_SIZE];
246 const DEFAULT_IS_USED: [bool; BLOCK_SIZE] = [false; BLOCK_SIZE];
247 const DEFAULT_NEXT_UNUSED: [u8; BLOCK_SIZE] = {
248     let mut next_unused = [INVALID_NEXT; BLOCK_SIZE];
249     let mut i = 0;
250     while i < next_unused.len() - 1 {
251         next_unused[i] = (i + 1) as u8;
252         i += 1;
253     }
254     next_unused
255 };
256 const DEFAULT_PREV_UNUSED: [u8; BLOCK_SIZE] = {
257     let mut prev_unused = [INVALID_PREV; BLOCK_SIZE];
258     let mut i = 1;
259     while i < prev_unused.len() {
260         prev_unused[i] = (i - 1) as u8;
261         i += 1;
262     }
263     prev_unused
264 };
265 
266 /// A block that have a shard of a double-array and other useful data structures.
267 pub struct DoubleArrayBlock {
268     pub id: usize,
269     pub units: [Unit; BLOCK_SIZE],
270     pub is_used: [bool; BLOCK_SIZE],
271     pub head_unused: u8,
272     pub next_unused: [u8; BLOCK_SIZE],
273     pub prev_unused: [u8; BLOCK_SIZE],
274 }
275 
276 impl DoubleArrayBlock {
new(id: usize) -> Self277     const fn new(id: usize) -> Self {
278         Self {
279             id,
280             units: DEFAULT_UNITS,
281             is_used: DEFAULT_IS_USED,
282             head_unused: 0,
283             next_unused: DEFAULT_NEXT_UNUSED,
284             prev_unused: DEFAULT_PREV_UNUSED,
285         }
286     }
287 
288     /// Finds a valid offset in this block.
find_offset<'a>( &'a self, unit_id: UnitID, labels: &'a Vec<u8>, ) -> impl Iterator<Item = u8> + 'a289     fn find_offset<'a>(
290         &'a self,
291         unit_id: UnitID,
292         labels: &'a Vec<u8>,
293     ) -> impl Iterator<Item = u8> + 'a {
294         assert!(labels.len() > 0);
295         FindOffset {
296             unused_id: self.head_unused,
297             block: self,
298             unit_id,
299             labels,
300         }
301     }
302 
reserve(&mut self, id: u8)303     fn reserve(&mut self, id: u8) {
304         // maintain is_used
305         self.is_used[id as usize] = true;
306 
307         let prev_id = self.prev_unused[id as usize];
308         let next_id = self.next_unused[id as usize];
309 
310         // maintain next_unused
311         if prev_id != INVALID_PREV {
312             self.next_unused[prev_id as usize] = next_id;
313         }
314         self.next_unused[id as usize] = INVALID_NEXT; // this line can be removed
315 
316         // maintain prev_unused
317         if next_id != INVALID_NEXT {
318             self.prev_unused[next_id as usize] = prev_id;
319         }
320         self.prev_unused[id as usize] = INVALID_PREV; // this line can be removed
321 
322         // maintain head_unused
323         if id == self.head_unused {
324             self.head_unused = next_id;
325         }
326     }
327 }
328 
329 pub struct FindOffset<'a> {
330     unused_id: u8,
331     block: &'a DoubleArrayBlock,
332     unit_id: UnitID, // parent node position to set the offset
333     labels: &'a Vec<u8>,
334 }
335 
336 impl<'a> FindOffset<'a> {
337     #[inline]
is_valid_offset(&self, offset: u8) -> bool338     fn is_valid_offset(&self, offset: u8) -> bool {
339         let offset_u32 = (self.block.id as u32) << 8 | offset as u32;
340         let relative_offset = self.unit_id as u32 ^ offset_u32;
341         if (relative_offset & (0xFF << 21)) > 0 && (relative_offset & 0xFF) > 0 {
342             return false;
343         }
344 
345         self.labels.iter().skip(1).all(|label| {
346             let id = offset ^ label;
347             match self.block.is_used.get(id as UnitID) {
348                 Some(is_used) => !*is_used,
349                 None => {
350                     // something is going wrong
351                     assert!(false, "DoubleArrayBlock is_used.get({}) was fault", id);
352                     false
353                 }
354             }
355         })
356     }
357 }
358 
359 impl<'a> Iterator for FindOffset<'a> {
360     type Item = u8;
361 
next(&mut self) -> Option<Self::Item>362     fn next(&mut self) -> Option<Self::Item> {
363         if self.unused_id == INVALID_NEXT && self.block.is_used[self.unused_id as usize] == true {
364             return None;
365         }
366 
367         // return if this block is full
368         if self.block.head_unused == INVALID_NEXT && self.block.is_used[0] == true {
369             assert!(self.block.is_used.iter().all(|is_used| *is_used)); // assert full
370             return None;
371         }
372         assert!(!self.block.is_used.iter().all(|is_used| *is_used)); // assert not full
373 
374         loop {
375             assert!(!self.block.is_used[self.unused_id as usize]);
376 
377             let first_label = *self.labels.first()?;
378             let offset = self.unused_id ^ first_label;
379 
380             let is_valid_offset = self.is_valid_offset(offset);
381 
382             // update unused_id to next unused node
383             self.unused_id = self.block.next_unused[self.unused_id as usize];
384 
385             if is_valid_offset {
386                 return Some(offset);
387             }
388 
389             if self.unused_id == INVALID_NEXT {
390                 return None;
391             }
392         }
393     }
394 }
395 
396 impl std::fmt::Debug for DoubleArrayBlock {
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result397     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
398         f.debug_struct("DoubleArrayBlock")
399             .field(
400                 "units",
401                 &format_args!(
402                     "[{}]",
403                     self.units
404                         .iter()
405                         .map(|u| u.to_string())
406                         .collect::<Vec<_>>()
407                         .join(", ")
408                 ),
409             )
410             .field(
411                 "is_used",
412                 &format_args!(
413                     "[{}]",
414                     self.is_used
415                         .iter()
416                         .map(|u| u.to_string())
417                         .collect::<Vec<_>>()
418                         .join(", ")
419                 ),
420             )
421             .field("head_unused", &self.head_unused)
422             .field(
423                 "next_unused",
424                 &format_args!(
425                     "[{}]",
426                     self.next_unused
427                         .iter()
428                         .map(|u| u.to_string())
429                         .collect::<Vec<_>>()
430                         .join(", ")
431                 ),
432             )
433             .field(
434                 "prev_unused",
435                 &format_args!(
436                     "[{}]",
437                     self.prev_unused
438                         .iter()
439                         .map(|u| u.to_string())
440                         .collect::<Vec<_>>()
441                         .join(", ")
442                 ),
443             )
444             .finish()
445     }
446 }
447 
448 #[cfg(test)]
449 mod tests {
450     use crate::builder::DoubleArrayBuilder;
451 
452     #[test]
test_build()453     fn test_build() {
454         let keyset: &[(&[u8], u32)] = &[
455             ("a".as_bytes(), 0),
456             ("aa".as_bytes(), 0),
457             ("aaa".as_bytes(), 0),
458             ("aaaa".as_bytes(), 0),
459             ("aaaaa".as_bytes(), 0),
460             ("ab".as_bytes(), 0),
461             ("abc".as_bytes(), 0),
462             ("abcd".as_bytes(), 0),
463             ("abcde".as_bytes(), 0),
464             ("abcdef".as_bytes(), 0),
465         ];
466 
467         let mut builder = DoubleArrayBuilder::new();
468         let da = builder.build_from_keyset(keyset);
469         assert!(da.is_some());
470 
471         assert!(0 < builder.num_units());
472         assert!(0 < builder.num_used_units());
473         assert!(builder.num_used_units() < builder.num_units());
474     }
475 }
476