1 //===- llvm/ADT/SparseMultiSet.h - Sparse multiset --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file defines the SparseMultiSet class, which adds multiset behavior to
11 /// the SparseSet.
12 ///
13 /// A sparse multiset holds a small number of objects identified by integer keys
14 /// from a moderately sized universe. The sparse multiset uses more memory than
15 /// other containers in order to provide faster operations. Any key can map to
16 /// multiple values. A SparseMultiSetNode class is provided, which serves as a
17 /// convenient base class for the contents of a SparseMultiSet.
18 ///
19 //===----------------------------------------------------------------------===//
20 
21 #ifndef LLVM_ADT_SPARSEMULTISET_H
22 #define LLVM_ADT_SPARSEMULTISET_H
23 
24 #include "llvm/ADT/identity.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/SparseSet.h"
27 #include <cassert>
28 #include <cstdint>
29 #include <cstdlib>
30 #include <iterator>
31 #include <limits>
32 #include <utility>
33 
34 namespace llvm {
35 
36 /// Fast multiset implementation for objects that can be identified by small
37 /// unsigned keys.
38 ///
39 /// SparseMultiSet allocates memory proportional to the size of the key
40 /// universe, so it is not recommended for building composite data structures.
41 /// It is useful for algorithms that require a single set with fast operations.
42 ///
43 /// Compared to DenseSet and DenseMap, SparseMultiSet provides constant-time
44 /// fast clear() as fast as a vector.  The find(), insert(), and erase()
45 /// operations are all constant time, and typically faster than a hash table.
46 /// The iteration order doesn't depend on numerical key values, it only depends
47 /// on the order of insert() and erase() operations.  Iteration order is the
48 /// insertion order. Iteration is only provided over elements of equivalent
49 /// keys, but iterators are bidirectional.
50 ///
51 /// Compared to BitVector, SparseMultiSet<unsigned> uses 8x-40x more memory, but
52 /// offers constant-time clear() and size() operations as well as fast iteration
53 /// independent on the size of the universe.
54 ///
55 /// SparseMultiSet contains a dense vector holding all the objects and a sparse
56 /// array holding indexes into the dense vector.  Most of the memory is used by
57 /// the sparse array which is the size of the key universe. The SparseT template
58 /// parameter provides a space/speed tradeoff for sets holding many elements.
59 ///
60 /// When SparseT is uint32_t, find() only touches up to 3 cache lines, but the
61 /// sparse array uses 4 x Universe bytes.
62 ///
63 /// When SparseT is uint8_t (the default), find() touches up to 3+[N/256] cache
64 /// lines, but the sparse array is 4x smaller.  N is the number of elements in
65 /// the set.
66 ///
67 /// For sets that may grow to thousands of elements, SparseT should be set to
68 /// uint16_t or uint32_t.
69 ///
70 /// Multiset behavior is provided by providing doubly linked lists for values
71 /// that are inlined in the dense vector. SparseMultiSet is a good choice when
72 /// one desires a growable number of entries per key, as it will retain the
73 /// SparseSet algorithmic properties despite being growable. Thus, it is often a
74 /// better choice than a SparseSet of growable containers or a vector of
75 /// vectors. SparseMultiSet also keeps iterators valid after erasure (provided
76 /// the iterators don't point to the element erased), allowing for more
77 /// intuitive and fast removal.
78 ///
79 /// @tparam ValueT      The type of objects in the set.
80 /// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT.
81 /// @tparam SparseT     An unsigned integer type. See above.
82 ///
83 template<typename ValueT,
84          typename KeyFunctorT = identity<unsigned>,
85          typename SparseT = uint8_t>
86 class SparseMultiSet {
87   static_assert(std::numeric_limits<SparseT>::is_integer &&
88                 !std::numeric_limits<SparseT>::is_signed,
89                 "SparseT must be an unsigned integer type");
90 
91   /// The actual data that's stored, as a doubly-linked list implemented via
92   /// indices into the DenseVector.  The doubly linked list is implemented
93   /// circular in Prev indices, and INVALID-terminated in Next indices. This
94   /// provides efficient access to list tails. These nodes can also be
95   /// tombstones, in which case they are actually nodes in a single-linked
96   /// freelist of recyclable slots.
97   struct SMSNode {
98     static constexpr unsigned INVALID = ~0U;
99 
100     ValueT Data;
101     unsigned Prev;
102     unsigned Next;
103 
104     SMSNode(ValueT D, unsigned P, unsigned N) : Data(D), Prev(P), Next(N) {}
105 
106     /// List tails have invalid Nexts.
107     bool isTail() const {
108       return Next == INVALID;
109     }
110 
111     /// Whether this node is a tombstone node, and thus is in our freelist.
112     bool isTombstone() const {
113       return Prev == INVALID;
114     }
115 
116     /// Since the list is circular in Prev, all non-tombstone nodes have a valid
117     /// Prev.
118     bool isValid() const { return Prev != INVALID; }
119   };
120 
121   using KeyT = typename KeyFunctorT::argument_type;
122   using DenseT = SmallVector<SMSNode, 8>;
123   DenseT Dense;
124   SparseT *Sparse = nullptr;
125   unsigned Universe = 0;
126   KeyFunctorT KeyIndexOf;
127   SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf;
128 
129   /// We have a built-in recycler for reusing tombstone slots. This recycler
130   /// puts a singly-linked free list into tombstone slots, allowing us quick
131   /// erasure, iterator preservation, and dense size.
132   unsigned FreelistIdx = SMSNode::INVALID;
133   unsigned NumFree = 0;
134 
135   unsigned sparseIndex(const ValueT &Val) const {
136     assert(ValIndexOf(Val) < Universe &&
137            "Invalid key in set. Did object mutate?");
138     return ValIndexOf(Val);
139   }
140   unsigned sparseIndex(const SMSNode &N) const { return sparseIndex(N.Data); }
141 
142   /// Whether the given entry is the head of the list. List heads's previous
143   /// pointers are to the tail of the list, allowing for efficient access to the
144   /// list tail. D must be a valid entry node.
145   bool isHead(const SMSNode &D) const {
146     assert(D.isValid() && "Invalid node for head");
147     return Dense[D.Prev].isTail();
148   }
149 
150   /// Whether the given entry is a singleton entry, i.e. the only entry with
151   /// that key.
152   bool isSingleton(const SMSNode &N) const {
153     assert(N.isValid() && "Invalid node for singleton");
154     // Is N its own predecessor?
155     return &Dense[N.Prev] == &N;
156   }
157 
158   /// Add in the given SMSNode. Uses a free entry in our freelist if
159   /// available. Returns the index of the added node.
160   unsigned addValue(const ValueT& V, unsigned Prev, unsigned Next) {
161     if (NumFree == 0) {
162       Dense.push_back(SMSNode(V, Prev, Next));
163       return Dense.size() - 1;
164     }
165 
166     // Peel off a free slot
167     unsigned Idx = FreelistIdx;
168     unsigned NextFree = Dense[Idx].Next;
169     assert(Dense[Idx].isTombstone() && "Non-tombstone free?");
170 
171     Dense[Idx] = SMSNode(V, Prev, Next);
172     FreelistIdx = NextFree;
173     --NumFree;
174     return Idx;
175   }
176 
177   /// Make the current index a new tombstone. Pushes it onto the freelist.
178   void makeTombstone(unsigned Idx) {
179     Dense[Idx].Prev = SMSNode::INVALID;
180     Dense[Idx].Next = FreelistIdx;
181     FreelistIdx = Idx;
182     ++NumFree;
183   }
184 
185 public:
186   using value_type = ValueT;
187   using reference = ValueT &;
188   using const_reference = const ValueT &;
189   using pointer = ValueT *;
190   using const_pointer = const ValueT *;
191   using size_type = unsigned;
192 
193   SparseMultiSet() = default;
194   SparseMultiSet(const SparseMultiSet &) = delete;
195   SparseMultiSet &operator=(const SparseMultiSet &) = delete;
196   ~SparseMultiSet() { free(Sparse); }
197 
198   /// Set the universe size which determines the largest key the set can hold.
199   /// The universe must be sized before any elements can be added.
200   ///
201   /// @param U Universe size. All object keys must be less than U.
202   ///
203   void setUniverse(unsigned U) {
204     // It's not hard to resize the universe on a non-empty set, but it doesn't
205     // seem like a likely use case, so we can add that code when we need it.
206     assert(empty() && "Can only resize universe on an empty map");
207     // Hysteresis prevents needless reallocations.
208     if (U >= Universe/4 && U <= Universe)
209       return;
210     free(Sparse);
211     // The Sparse array doesn't actually need to be initialized, so malloc
212     // would be enough here, but that will cause tools like valgrind to
213     // complain about branching on uninitialized data.
214     Sparse = static_cast<SparseT*>(safe_calloc(U, sizeof(SparseT)));
215     Universe = U;
216   }
217 
218   /// Our iterators are iterators over the collection of objects that share a
219   /// key.
220   template <typename SMSPtrTy> class iterator_base {
221     friend class SparseMultiSet;
222 
223   public:
224     using iterator_category = std::bidirectional_iterator_tag;
225     using value_type = ValueT;
226     using difference_type = std::ptrdiff_t;
227     using pointer = value_type *;
228     using reference = value_type &;
229 
230   private:
231     SMSPtrTy SMS;
232     unsigned Idx;
233     unsigned SparseIdx;
234 
235     iterator_base(SMSPtrTy P, unsigned I, unsigned SI)
236       : SMS(P), Idx(I), SparseIdx(SI) {}
237 
238     /// Whether our iterator has fallen outside our dense vector.
239     bool isEnd() const {
240       if (Idx == SMSNode::INVALID)
241         return true;
242 
243       assert(Idx < SMS->Dense.size() && "Out of range, non-INVALID Idx?");
244       return false;
245     }
246 
247     /// Whether our iterator is properly keyed, i.e. the SparseIdx is valid
248     bool isKeyed() const { return SparseIdx < SMS->Universe; }
249 
250     unsigned Prev() const { return SMS->Dense[Idx].Prev; }
251     unsigned Next() const { return SMS->Dense[Idx].Next; }
252 
253     void setPrev(unsigned P) { SMS->Dense[Idx].Prev = P; }
254     void setNext(unsigned N) { SMS->Dense[Idx].Next = N; }
255 
256   public:
257     reference operator*() const {
258       assert(isKeyed() && SMS->sparseIndex(SMS->Dense[Idx].Data) == SparseIdx &&
259              "Dereferencing iterator of invalid key or index");
260 
261       return SMS->Dense[Idx].Data;
262     }
263     pointer operator->() const { return &operator*(); }
264 
265     /// Comparison operators
266     bool operator==(const iterator_base &RHS) const {
267       // end compares equal
268       if (SMS == RHS.SMS && Idx == RHS.Idx) {
269         assert((isEnd() || SparseIdx == RHS.SparseIdx) &&
270                "Same dense entry, but different keys?");
271         return true;
272       }
273 
274       return false;
275     }
276 
277     bool operator!=(const iterator_base &RHS) const {
278       return !operator==(RHS);
279     }
280 
281     /// Increment and decrement operators
282     iterator_base &operator--() { // predecrement - Back up
283       assert(isKeyed() && "Decrementing an invalid iterator");
284       assert((isEnd() || !SMS->isHead(SMS->Dense[Idx])) &&
285              "Decrementing head of list");
286 
287       // If we're at the end, then issue a new find()
288       if (isEnd())
289         Idx = SMS->findIndex(SparseIdx).Prev();
290       else
291         Idx = Prev();
292 
293       return *this;
294     }
295     iterator_base &operator++() { // preincrement - Advance
296       assert(!isEnd() && isKeyed() && "Incrementing an invalid/end iterator");
297       Idx = Next();
298       return *this;
299     }
300     iterator_base operator--(int) { // postdecrement
301       iterator_base I(*this);
302       --*this;
303       return I;
304     }
305     iterator_base operator++(int) { // postincrement
306       iterator_base I(*this);
307       ++*this;
308       return I;
309     }
310   };
311 
312   using iterator = iterator_base<SparseMultiSet *>;
313   using const_iterator = iterator_base<const SparseMultiSet *>;
314 
315   // Convenience types
316   using RangePair = std::pair<iterator, iterator>;
317 
318   /// Returns an iterator past this container. Note that such an iterator cannot
319   /// be decremented, but will compare equal to other end iterators.
320   iterator end() { return iterator(this, SMSNode::INVALID, SMSNode::INVALID); }
321   const_iterator end() const {
322     return const_iterator(this, SMSNode::INVALID, SMSNode::INVALID);
323   }
324 
325   /// Returns true if the set is empty.
326   ///
327   /// This is not the same as BitVector::empty().
328   ///
329   bool empty() const { return size() == 0; }
330 
331   /// Returns the number of elements in the set.
332   ///
333   /// This is not the same as BitVector::size() which returns the size of the
334   /// universe.
335   ///
336   size_type size() const {
337     assert(NumFree <= Dense.size() && "Out-of-bounds free entries");
338     return Dense.size() - NumFree;
339   }
340 
341   /// Clears the set.  This is a very fast constant time operation.
342   ///
343   void clear() {
344     // Sparse does not need to be cleared, see find().
345     Dense.clear();
346     NumFree = 0;
347     FreelistIdx = SMSNode::INVALID;
348   }
349 
350   /// Find an element by its index.
351   ///
352   /// @param   Idx A valid index to find.
353   /// @returns An iterator to the element identified by key, or end().
354   ///
355   iterator findIndex(unsigned Idx) {
356     assert(Idx < Universe && "Key out of range");
357     const unsigned Stride = std::numeric_limits<SparseT>::max() + 1u;
358     for (unsigned i = Sparse[Idx], e = Dense.size(); i < e; i += Stride) {
359       const unsigned FoundIdx = sparseIndex(Dense[i]);
360       // Check that we're pointing at the correct entry and that it is the head
361       // of a valid list.
362       if (Idx == FoundIdx && Dense[i].isValid() && isHead(Dense[i]))
363         return iterator(this, i, Idx);
364       // Stride is 0 when SparseT >= unsigned.  We don't need to loop.
365       if (!Stride)
366         break;
367     }
368     return end();
369   }
370 
371   /// Find an element by its key.
372   ///
373   /// @param   Key A valid key to find.
374   /// @returns An iterator to the element identified by key, or end().
375   ///
376   iterator find(const KeyT &Key) {
377     return findIndex(KeyIndexOf(Key));
378   }
379 
380   const_iterator find(const KeyT &Key) const {
381     iterator I = const_cast<SparseMultiSet*>(this)->findIndex(KeyIndexOf(Key));
382     return const_iterator(I.SMS, I.Idx, KeyIndexOf(Key));
383   }
384 
385   /// Returns the number of elements identified by Key. This will be linear in
386   /// the number of elements of that key.
387   size_type count(const KeyT &Key) const {
388     unsigned Ret = 0;
389     for (const_iterator It = find(Key); It != end(); ++It)
390       ++Ret;
391 
392     return Ret;
393   }
394 
395   /// Returns true if this set contains an element identified by Key.
396   bool contains(const KeyT &Key) const {
397     return find(Key) != end();
398   }
399 
400   /// Return the head and tail of the subset's list, otherwise returns end().
401   iterator getHead(const KeyT &Key) { return find(Key); }
402   iterator getTail(const KeyT &Key) {
403     iterator I = find(Key);
404     if (I != end())
405       I = iterator(this, I.Prev(), KeyIndexOf(Key));
406     return I;
407   }
408 
409   /// The bounds of the range of items sharing Key K. First member is the head
410   /// of the list, and the second member is a decrementable end iterator for
411   /// that key.
412   RangePair equal_range(const KeyT &K) {
413     iterator B = find(K);
414     iterator E = iterator(this, SMSNode::INVALID, B.SparseIdx);
415     return std::make_pair(B, E);
416   }
417 
418   /// Insert a new element at the tail of the subset list. Returns an iterator
419   /// to the newly added entry.
420   iterator insert(const ValueT &Val) {
421     unsigned Idx = sparseIndex(Val);
422     iterator I = findIndex(Idx);
423 
424     unsigned NodeIdx = addValue(Val, SMSNode::INVALID, SMSNode::INVALID);
425 
426     if (I == end()) {
427       // Make a singleton list
428       Sparse[Idx] = NodeIdx;
429       Dense[NodeIdx].Prev = NodeIdx;
430       return iterator(this, NodeIdx, Idx);
431     }
432 
433     // Stick it at the end.
434     unsigned HeadIdx = I.Idx;
435     unsigned TailIdx = I.Prev();
436     Dense[TailIdx].Next = NodeIdx;
437     Dense[HeadIdx].Prev = NodeIdx;
438     Dense[NodeIdx].Prev = TailIdx;
439 
440     return iterator(this, NodeIdx, Idx);
441   }
442 
443   /// Erases an existing element identified by a valid iterator.
444   ///
445   /// This invalidates iterators pointing at the same entry, but erase() returns
446   /// an iterator pointing to the next element in the subset's list. This makes
447   /// it possible to erase selected elements while iterating over the subset:
448   ///
449   ///   tie(I, E) = Set.equal_range(Key);
450   ///   while (I != E)
451   ///     if (test(*I))
452   ///       I = Set.erase(I);
453   ///     else
454   ///       ++I;
455   ///
456   /// Note that if the last element in the subset list is erased, this will
457   /// return an end iterator which can be decremented to get the new tail (if it
458   /// exists):
459   ///
460   ///  tie(B, I) = Set.equal_range(Key);
461   ///  for (bool isBegin = B == I; !isBegin; /* empty */) {
462   ///    isBegin = (--I) == B;
463   ///    if (test(I))
464   ///      break;
465   ///    I = erase(I);
466   ///  }
467   iterator erase(iterator I) {
468     assert(I.isKeyed() && !I.isEnd() && !Dense[I.Idx].isTombstone() &&
469            "erasing invalid/end/tombstone iterator");
470 
471     // First, unlink the node from its list. Then swap the node out with the
472     // dense vector's last entry
473     iterator NextI = unlink(Dense[I.Idx]);
474 
475     // Put in a tombstone.
476     makeTombstone(I.Idx);
477 
478     return NextI;
479   }
480 
481   /// Erase all elements with the given key. This invalidates all
482   /// iterators of that key.
483   void eraseAll(const KeyT &K) {
484     for (iterator I = find(K); I != end(); /* empty */)
485       I = erase(I);
486   }
487 
488 private:
489   /// Unlink the node from its list. Returns the next node in the list.
490   iterator unlink(const SMSNode &N) {
491     if (isSingleton(N)) {
492       // Singleton is already unlinked
493       assert(N.Next == SMSNode::INVALID && "Singleton has next?");
494       return iterator(this, SMSNode::INVALID, ValIndexOf(N.Data));
495     }
496 
497     if (isHead(N)) {
498       // If we're the head, then update the sparse array and our next.
499       Sparse[sparseIndex(N)] = N.Next;
500       Dense[N.Next].Prev = N.Prev;
501       return iterator(this, N.Next, ValIndexOf(N.Data));
502     }
503 
504     if (N.isTail()) {
505       // If we're the tail, then update our head and our previous.
506       findIndex(sparseIndex(N)).setPrev(N.Prev);
507       Dense[N.Prev].Next = N.Next;
508 
509       // Give back an end iterator that can be decremented
510       iterator I(this, N.Prev, ValIndexOf(N.Data));
511       return ++I;
512     }
513 
514     // Otherwise, just drop us
515     Dense[N.Next].Prev = N.Prev;
516     Dense[N.Prev].Next = N.Next;
517     return iterator(this, N.Next, ValIndexOf(N.Data));
518   }
519 };
520 
521 } // end namespace llvm
522 
523 #endif // LLVM_ADT_SPARSEMULTISET_H
524