/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <errno.h>
#include <stdint.h>

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <limits>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include <folly/Portability.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/AtomicStruct.h>

// Ignore shadowing warnings within this file, so includers can use -Wshadow.
FOLLY_PUSH_WARNING
FOLLY_GNU_DISABLE_WARNING("-Wshadow")

namespace folly {

namespace detail {
template <typename Pool>
struct IndexedMemPoolRecycler;
} // namespace detail

template <
    typename T,
    bool EagerRecycleWhenTrivial = false,
    bool EagerRecycleWhenNotTrivial = true>
struct IndexedMemPoolTraits {
  static constexpr bool eagerRecycle() {
    return std::is_trivial<T>::value ? EagerRecycleWhenTrivial
                                     : EagerRecycleWhenNotTrivial;
  }

  /// Called when the element pointed to by ptr is allocated for the
  /// first time.
  static void initialize(T* ptr) {
    if (!eagerRecycle()) {
      new (ptr) T();
    }
  }

  /// Called when the element pointed to by ptr is freed at the pool
  /// destruction time.
  static void cleanup(T* ptr) {
    if (!eagerRecycle()) {
      ptr->~T();
    }
  }

  /// Called when the element is allocated with the arguments forwarded from
  /// IndexedMemPool::allocElem.
  template <typename... Args>
  static void onAllocate(T* ptr, Args&&... args) {
    static_assert(
        sizeof...(Args) == 0 || eagerRecycle(),
        "emplace-style allocation requires eager recycle, "
        "which is defaulted only for non-trivial types");
    if (eagerRecycle()) {
      new (ptr) T(std::forward<Args>(args)...);
    }
  }

  /// Called when the element is recycled.
  static void onRecycle(T* ptr) {
    if (eagerRecycle()) {
      ptr->~T();
    }
  }
};
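
// A custom Traits sketch (hypothetical, for illustration only): construct
// elements lazily, like IndexedMemPoolTraitsLazyRecycle, but reset live
// objects on recycle instead of destroying them.  Only the four hooks below
// are called by the pool in this header; "Connection" and its clear() method
// are assumptions.
//
//   struct ConnectionRecycleTraits {
//     static void initialize(Connection* ptr) { new (ptr) Connection(); }
//     static void cleanup(Connection* ptr) { ptr->~Connection(); }
//     template <typename... Args>
//     static void onAllocate(Connection*, Args&&...) {}
//     static void onRecycle(Connection* ptr) { ptr->clear(); }
//   };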

/// IndexedMemPool traits that implement the lazy lifecycle strategy. In this
/// strategy elements are default-constructed the first time they are allocated,
/// and destroyed when the pool itself is destroyed.
template <typename T>
using IndexedMemPoolTraitsLazyRecycle = IndexedMemPoolTraits<T, false, false>;

/// IndexedMemPool traits that implement the eager lifecycle strategy. In this
/// strategy elements are constructed when they are allocated from the pool and
/// destroyed when recycled.
template <typename T>
using IndexedMemPoolTraitsEagerRecycle = IndexedMemPoolTraits<T, true, true>;
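
// Choosing a strategy (illustrative sketch; "Widget" is a hypothetical
// non-trivial element type, and the snippets simply plug the aliases above
// into the Traits parameter of IndexedMemPool, declared below):
//
//   // Eager: Widget's constructor runs on allocIndex/allocElem and its
//   // destructor runs on recycle (the default for non-trivial T).
//   folly::IndexedMemPool<
//       Widget, 32, 200, std::atomic,
//       folly::IndexedMemPoolTraitsEagerRecycle<Widget>> eagerPool(1000);
//
//   // Lazy: Widget() runs the first time a slot is handed out; ~Widget()
//   // runs only when the pool itself is destroyed.
//   folly::IndexedMemPool<
//       Widget, 32, 200, std::atomic,
//       folly::IndexedMemPoolTraitsLazyRecycle<Widget>> lazyPool(1000);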

/// Instances of IndexedMemPool dynamically allocate and then pool their
/// element type (T), returning 4-byte integer indices that can be passed
/// to the pool's operator[] method to access or obtain pointers to the
/// actual elements.  The memory backing items returned from the pool
/// will always be readable, even if items have been returned to the pool.
/// These two features are useful for lock-free algorithms.  The indexing
/// behavior makes it easy to build tagged pointer-like-things, since
/// a large number of elements can be managed using fewer bits than a
/// full pointer.  The access-after-free behavior makes it safe to read
/// from T-s even after they have been recycled, since it is guaranteed
/// that the memory won't have been returned to the OS and unmapped
/// (the algorithm must still use a mechanism to validate that the read
/// was correct, but it doesn't have to worry about page faults), and if
/// the elements use internal sequence numbers it can be guaranteed that
/// there won't be an ABA match due to the element being overwritten with
/// a different type that has the same bit pattern.
///
/// The object lifecycle strategy is controlled by the Traits parameter.
/// One strategy, implemented by IndexedMemPoolTraitsEagerRecycle, is to
/// construct objects when they are allocated from the pool and destroy
/// them when they are recycled.  In this mode allocIndex and allocElem
/// have emplace-like semantics.  In another strategy, implemented by
/// IndexedMemPoolTraitsLazyRecycle, objects are default-constructed the
/// first time they are removed from the pool, and deleted when the pool
/// itself is deleted.  By default the first mode is used for non-trivial
/// T, and the second is used for trivial T.  Clients can customize the
/// object lifecycle by providing their own Traits implementation.
/// See IndexedMemPoolTraits for a Traits example.
///
/// IMPORTANT: Space for extra elements is allocated to account for those
/// that are inaccessible because they are in other local lists, so the
/// actual number of items that can be allocated ranges from capacity to
/// capacity + (NumLocalLists_-1)*LocalListLimit_.  This is important if
/// you are trying to maximize the capacity of the pool while constraining
/// the bit size of the resulting pointers, because the pointers will
/// actually range up to the boosted capacity.  See maxIndexForCapacity
/// and capacityForMaxIndex.
///
/// To avoid contention, NumLocalLists_ free lists of limited (less than
/// or equal to LocalListLimit_) size are maintained, and each thread
/// retrieves and returns entries from its associated local list.  If the
/// local list becomes too large then elements are placed in bulk in a
/// global free list.  This allows items to be efficiently recirculated
/// from consumers to producers.  AccessSpreader is used to access the
/// local lists, so there is no performance advantage to having more
/// local lists than L1 caches.
///
/// The pool mmap-s the entire necessary address space when the pool is
/// constructed, but delays element construction.  This means that only
/// elements that are actually returned to the caller get paged into the
/// process's resident set (RSS).
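///
/// Example (a minimal usage sketch; "Counter" and the capacity are
/// assumptions chosen for illustration):
///
///   struct Counter { uint64_t hits = 0; };
///   folly::IndexedMemPool<Counter> pool(1000);
///
///   uint32_t idx = pool.allocIndex();  // 0 means no element was available
///   if (idx != 0) {
///     pool[idx].hits += 1;             // access the element through its index
///     assert(pool.locateElem(&pool[idx]) == idx);
///     pool.recycleIndex(idx);          // return the slot to the pool
///   }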
template <
    typename T,
    uint32_t NumLocalLists_ = 32,
    uint32_t LocalListLimit_ = 200,
    template <typename> class Atom = std::atomic,
    typename Traits = IndexedMemPoolTraits<T>>
struct IndexedMemPool {
  typedef T value_type;

  typedef std::unique_ptr<T, detail::IndexedMemPoolRecycler<IndexedMemPool>>
      UniquePtr;

  IndexedMemPool(const IndexedMemPool&) = delete;
  IndexedMemPool& operator=(const IndexedMemPool&) = delete;

  static_assert(LocalListLimit_ <= 255, "LocalListLimit must fit in 8 bits");
  enum {
    NumLocalLists = NumLocalLists_,
    LocalListLimit = LocalListLimit_,
  };

  static_assert(
      std::is_nothrow_default_constructible<Atom<uint32_t>>::value,
      "Atom must be nothrow default constructible");

  // these are public because clients may need to reason about the number
  // of bits required to hold indices from a pool, given its capacity

  static constexpr uint32_t maxIndexForCapacity(uint32_t capacity) {
    // index of std::numeric_limits<uint32_t>::max() is reserved for isAllocated
    // tracking
    return uint32_t(std::min(
        uint64_t(capacity) + (NumLocalLists - 1) * LocalListLimit,
        uint64_t(std::numeric_limits<uint32_t>::max() - 1)));
  }

  static constexpr uint32_t capacityForMaxIndex(uint32_t maxIndex) {
    return maxIndex - (NumLocalLists - 1) * LocalListLimit;
  }
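
  // Worked example (numbers follow from the defaults above, shown only for
  // illustration): with NumLocalLists_ == 32 and LocalListLimit_ == 200, a
  // pool constructed with capacity 1000 may hand out indices up to
  // maxIndexForCapacity(1000) == 1000 + 31 * 200 == 7200, so a tagged-index
  // scheme must budget 13 bits (2^13 == 8192), not the 10 bits that the
  // requested capacity alone would suggest.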

  /// Constructs a pool that can allocate at least _capacity_ elements,
  /// even if all the local lists are full
  explicit IndexedMemPool(uint32_t capacity)
      : actualCapacity_(maxIndexForCapacity(capacity)),
        size_(0),
        globalHead_(TaggedPtr{}) {
    const size_t needed = sizeof(Slot) * (actualCapacity_ + 1);
    size_t pagesize = size_t(sysconf(_SC_PAGESIZE));
    mmapLength_ = ((needed - 1) & ~(pagesize - 1)) + pagesize;
    assert(needed <= mmapLength_ && mmapLength_ < needed + pagesize);
    assert((mmapLength_ % pagesize) == 0);
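    // e.g. (hypothetical numbers) with 4096-byte pages and needed == 5000,
    // mmapLength_ == ((5000 - 1) & ~4095) + 4096 == 8192: needed rounded up
    // to the next multiple of the page size, which is what the asserts above
    // check.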

    slots_ = static_cast<Slot*>(mmap(
        nullptr,
        mmapLength_,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0));
    if (slots_ == MAP_FAILED) {
      assert(errno == ENOMEM);
      throw std::bad_alloc();
    }
  }

  /// Destroys all of the contained elements
  ~IndexedMemPool() {
    using A = Atom<uint32_t>;
    for (uint32_t i = maxAllocatedIndex(); i > 0; --i) {
      Traits::cleanup(&slots_[i].elem);
      slots_[i].localNext.~A();
      slots_[i].globalNext.~A();
    }
    munmap(slots_, mmapLength_);
  }

  /// Returns a lower bound on the number of elements that may be
  /// simultaneously allocated and not yet recycled.  Because of the
  /// local lists it is possible that more elements than this are returned
  /// successfully
  uint32_t capacity() { return capacityForMaxIndex(actualCapacity_); }

  /// Returns the maximum index of elements ever allocated in this pool
  /// including elements that have been recycled.
  uint32_t maxAllocatedIndex() const {
    // Take the minimum since it is possible that size_ > actualCapacity_.
    // This can happen if there are multiple concurrent requests
    // when size_ == actualCapacity_ - 1.
    return std::min(uint32_t(size_), uint32_t(actualCapacity_));
  }

  /// Finds a slot with a non-zero index, emplaces a T there if we're
  /// using the eager recycle lifecycle mode, and returns the index,
  /// or returns 0 if no elements are available.  Passes a pointer to
  /// the element to Traits::onAllocate before the slot is marked as
  /// allocated.
  template <typename... Args>
  uint32_t allocIndex(Args&&... args) {
    auto idx = localPop(localHead());
    if (idx != 0) {
      Slot& s = slot(idx);
      Traits::onAllocate(&s.elem, std::forward<Args>(args)...);
      markAllocated(s);
    }
    return idx;
  }

  /// If an element is available, returns a std::unique_ptr to it that will
  /// recycle the element to the pool when it is reclaimed, otherwise returns
  /// a null (falsy) std::unique_ptr.  Passes a pointer to the element to
  /// Traits::onAllocate before the slot is marked as allocated.
  template <typename... Args>
  UniquePtr allocElem(Args&&... args) {
    auto idx = allocIndex(std::forward<Args>(args)...);
    T* ptr = idx == 0 ? nullptr : &slot(idx).elem;
    return UniquePtr(ptr, typename UniquePtr::deleter_type(this));
  }
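
  // Example (illustrative only): under the eager-recycle traits, allocElem
  // forwards its arguments to T's constructor, and the returned UniquePtr
  // recycles the slot when it goes out of scope.  "Widget" and its members
  // are assumptions for this sketch.
  //
  //   folly::IndexedMemPool<
  //       Widget, 32, 200, std::atomic,
  //       folly::IndexedMemPoolTraitsEagerRecycle<Widget>> pool(100);
  //   if (auto p = pool.allocElem(42)) {  // constructs Widget{42} in place
  //     p->doWork();                      // hypothetical member function
  //   }                                   // ~Widget() runs, slot is recycled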

  /// Gives up ownership previously granted by allocIndex() or allocElem()
  void recycleIndex(uint32_t idx) {
    assert(isAllocated(idx));
    localPush(localHead(), idx);
  }

  /// Provides access to the pooled element referenced by idx
  T& operator[](uint32_t idx) { return slot(idx).elem; }

  /// Provides access to the pooled element referenced by idx
  const T& operator[](uint32_t idx) const { return slot(idx).elem; }

  /// If elem == &pool[idx], then pool.locateElem(elem) == idx.  Also,
  /// pool.locateElem(nullptr) == 0
  uint32_t locateElem(const T* elem) const {
    if (!elem) {
      return 0;
    }

    static_assert(std::is_standard_layout<Slot>::value, "offsetof needs POD");

    auto slot = reinterpret_cast<const Slot*>(
        reinterpret_cast<const char*>(elem) - offsetof(Slot, elem));
    auto rv = uint32_t(slot - slots_);

    // this assert also tests that rv is in range
    assert(elem == &(*this)[rv]);
    return rv;
  }
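
  // Note (added for clarity): locateElem is O(1).  It recovers the index by
  // pointer arithmetic on the contiguous slots_ array via offsetof, which is
  // why Slot must be standard-layout (checked by the static_assert above).
  // Round trip, assuming idx came from allocIndex and has not been recycled:
  //
  //   T* p = &pool[idx];
  //   assert(pool.locateElem(p) == idx);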

  /// Returns true iff idx has been allocated via allocIndex() or allocElem()
  /// and not recycleIndex()ed
  bool isAllocated(uint32_t idx) const {
    return slot(idx).localNext.load(std::memory_order_acquire) == uint32_t(-1);
  }

 private:
  ///////////// types

  struct Slot {
    T elem;
    Atom<uint32_t> localNext;
    Atom<uint32_t> globalNext;

    Slot() : localNext{}, globalNext{} {}
  };

  struct TaggedPtr {
    uint32_t idx;

    // size is bottom 8 bits, tag in top 24.  g++'s code generation for
    // bitfields seems to depend on the phase of the moon, plus we can
    // do better because we can rely on other checks to avoid masking
    uint32_t tagAndSize;

    enum : uint32_t {
      SizeBits = 8,
      SizeMask = (1U << SizeBits) - 1,
      TagIncr = 1U << SizeBits,
    };

    uint32_t size() const { return tagAndSize & SizeMask; }

    TaggedPtr withSize(uint32_t repl) const {
      assert(repl <= LocalListLimit);
      return TaggedPtr{idx, (tagAndSize & ~SizeMask) | repl};
    }

    TaggedPtr withSizeIncr() const {
      assert(size() < LocalListLimit);
      return TaggedPtr{idx, tagAndSize + 1};
    }

    TaggedPtr withSizeDecr() const {
      assert(size() > 0);
      return TaggedPtr{idx, tagAndSize - 1};
    }

    TaggedPtr withIdx(uint32_t repl) const {
      return TaggedPtr{repl, tagAndSize + TagIncr};
    }

    TaggedPtr withEmpty() const { return withIdx(0).withSize(0); }
  };
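
  // Packing illustration (added comment): with SizeBits == 8,
  // tagAndSize == 0x00000305 encodes size() == 5 and tag == 3.  withIdx()
  // always adds TagIncr (0x100), so a head that is popped and re-pushed by a
  // concurrent thread between our load and CAS carries a different tag and
  // the CAS fails; this is the usual tagged-pointer defense against ABA on
  // the free lists.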

  struct alignas(hardware_destructive_interference_size) LocalList {
    AtomicStruct<TaggedPtr, Atom> head;

    LocalList() : head(TaggedPtr{}) {}
  };

  ////////// fields

  /// the number of bytes allocated from mmap, which is a multiple of
  /// the page size of the machine
  size_t mmapLength_;

  /// the actual number of slots that we will allocate, to guarantee
  /// that we will satisfy the capacity requested at construction time.
  /// They will be numbered 1..actualCapacity_ (note the 1-based counting),
  /// and occupy slots_[1..actualCapacity_].
  uint32_t actualCapacity_;

  /// this records the number of slots that have actually been constructed.
  /// To allow use of atomic ++ instead of CAS, we let this overflow.
  /// The actual number of constructed elements is min(actualCapacity_,
  /// size_)
  Atom<uint32_t> size_;

  /// raw storage, only 1..min(size_,actualCapacity_) (inclusive) are
  /// actually constructed.  Note that slots_[0] is not constructed or used
  alignas(hardware_destructive_interference_size) Slot* slots_;

  /// use AccessSpreader to find your list.  We use stripes instead of
  /// thread-local to avoid the need to grow or shrink on thread start
  /// or join.  These are heads of lists chained with localNext
  LocalList local_[NumLocalLists];

  /// this is the head of a list of nodes chained by globalNext, that are
  /// themselves each the head of a list chained by localNext
  alignas(hardware_destructive_interference_size)
      AtomicStruct<TaggedPtr, Atom> globalHead_;

  ///////////// private methods

  uint32_t slotIndex(uint32_t idx) const {
    assert(
        0 < idx && idx <= actualCapacity_ &&
        idx <= size_.load(std::memory_order_acquire));
    return idx;
  }

  Slot& slot(uint32_t idx) { return slots_[slotIndex(idx)]; }

  const Slot& slot(uint32_t idx) const { return slots_[slotIndex(idx)]; }

  // localHead references a full list chained by localNext.  s should
  // reference slot(localHead); it is passed as a micro-optimization
  void globalPush(Slot& s, uint32_t localHead) {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      s.globalNext.store(gh.idx, std::memory_order_relaxed);
      if (globalHead_.compare_exchange_strong(gh, gh.withIdx(localHead))) {
        // success
        return;
      }
    }
  }

  // idx references a single node
  void localPush(AtomicStruct<TaggedPtr, Atom>& head, uint32_t idx) {
    Slot& s = slot(idx);
    TaggedPtr h = head.load(std::memory_order_acquire);
    bool recycled = false;
    while (true) {
      s.localNext.store(h.idx, std::memory_order_release);
      if (!recycled) {
        Traits::onRecycle(&slot(idx).elem);
        recycled = true;
      }

      if (h.size() == LocalListLimit) {
        // push will overflow local list, steal it instead
        if (head.compare_exchange_strong(h, h.withEmpty())) {
          // steal was successful, put everything in the global list
          globalPush(s, idx);
          return;
        }
      } else {
        // local list has space
        if (head.compare_exchange_strong(h, h.withIdx(idx).withSizeIncr())) {
          // success
          return;
        }
      }
      // h was updated by failing CAS
    }
  }

  // returns 0 if empty
  uint32_t globalPop() {
    while (true) {
      TaggedPtr gh = globalHead_.load(std::memory_order_acquire);
      if (gh.idx == 0 ||
          globalHead_.compare_exchange_strong(
              gh,
              gh.withIdx(
                  slot(gh.idx).globalNext.load(std::memory_order_relaxed)))) {
        // global list is empty, or pop was successful
        return gh.idx;
      }
    }
  }

  // returns 0 if allocation failed
  uint32_t localPop(AtomicStruct<TaggedPtr, Atom>& head) {
    while (true) {
      TaggedPtr h = head.load(std::memory_order_acquire);
      if (h.idx != 0) {
        // local list is non-empty, try to pop
        Slot& s = slot(h.idx);
        auto next = s.localNext.load(std::memory_order_relaxed);
        if (head.compare_exchange_strong(h, h.withIdx(next).withSizeDecr())) {
          // success
          return h.idx;
        }
        continue;
      }

      uint32_t idx = globalPop();
      if (idx == 0) {
        // global list is empty, allocate and construct new slot
        if (size_.load(std::memory_order_relaxed) >= actualCapacity_ ||
            (idx = ++size_) > actualCapacity_) {
          // allocation failed
          return 0;
        }
        Slot& s = slot(idx);
        // Atom is enforced above to be nothrow-default-constructible
        // As an optimization, use default-initialization (no parens) rather
        // than direct-initialization (with parens): these locations are
        // stored-to before they are loaded-from
        new (&s.localNext) Atom<uint32_t>;
        new (&s.globalNext) Atom<uint32_t>;
        Traits::initialize(&s.elem);
        return idx;
      }

      Slot& s = slot(idx);
      auto next = s.localNext.load(std::memory_order_relaxed);
      if (head.compare_exchange_strong(
              h, h.withIdx(next).withSize(LocalListLimit))) {
        // global list moved to local list, keep head for us
        return idx;
      }
      // local bulk push failed, return idx to the global list and try again
      globalPush(s, idx);
    }
  }

  AtomicStruct<TaggedPtr, Atom>& localHead() {
    auto stripe = AccessSpreader<Atom>::current(NumLocalLists);
    return local_[stripe].head;
  }

  void markAllocated(Slot& slot) {
    slot.localNext.store(uint32_t(-1), std::memory_order_release);
  }

 public:
  static constexpr std::size_t kSlotSize = sizeof(Slot);
};

namespace detail {

/// This is a stateful Deleter functor, which allows std::unique_ptr
/// to track elements allocated from an IndexedMemPool by tracking the
/// associated pool.  See IndexedMemPool::allocElem.
template <typename Pool>
struct IndexedMemPoolRecycler {
  Pool* pool;

  explicit IndexedMemPoolRecycler(Pool* pool) : pool(pool) {}

  IndexedMemPoolRecycler(const IndexedMemPoolRecycler<Pool>& rhs) = default;
  IndexedMemPoolRecycler& operator=(const IndexedMemPoolRecycler<Pool>& rhs) =
      default;

  void operator()(typename Pool::value_type* elem) const {
    pool->recycleIndex(pool->locateElem(elem));
  }
};

} // namespace detail

} // namespace folly

FOLLY_POP_WARNING