//  Copyright (c) 2014-2016 John Biddiscombe
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef HPX_PARCELSET_POLICIES_VERBS_MEMORY_POOL
#define HPX_PARCELSET_POLICIES_VERBS_MEMORY_POOL

#include <hpx/lcos/local/mutex.hpp>
#include <hpx/lcos/local/spinlock.hpp>
//
#include <atomic>
#include <stack>
#include <unordered_map>
#include <iostream>
#include <cstddef>
#include <cstdint>
#include <memory>
//
#include <boost/lockfree/stack.hpp>
//
#include <hpx/config/parcelport_defines.hpp>
//
#include <plugins/parcelport/parcelport_logging.hpp>
#include <plugins/parcelport/verbs/rdma/rdma_locks.hpp>
#include <plugins/parcelport/verbs/rdma/rdma_chunk_pool.hpp>
#include <plugins/parcelport/verbs/rdma/verbs_protection_domain.hpp>
#include <plugins/parcelport/verbs/rdma/verbs_memory_region.hpp>

// the default memory chunk size in bytes
#define RDMA_POOL_1K_CHUNK          0x001*0x0400 //  1KB
#define RDMA_POOL_SMALL_CHUNK_SIZE  0x010*0x0400 // 16KB
#define RDMA_POOL_MEDIUM_CHUNK_SIZE 0x040*0x0400 // 64KB
#define RDMA_POOL_LARGE_CHUNK_SIZE  0x400*0x0400 //  1MB

#define RDMA_POOL_MAX_1K_CHUNKS     1024
#define RDMA_POOL_MAX_SMALL_CHUNKS  1024
#define RDMA_POOL_MAX_MEDIUM_CHUNKS 64
#define RDMA_POOL_MAX_LARGE_CHUNKS  32

#define RDMA_POOL_USE_LOCKFREE_STACK 1

// if the HPX configuration has set a different value, use it
#if defined(HPX_PARCELPORT_VERBS_MEMORY_CHUNK_SIZE)
# undef RDMA_POOL_SMALL_CHUNK_SIZE
# define RDMA_POOL_SMALL_CHUNK_SIZE HPX_PARCELPORT_VERBS_MEMORY_CHUNK_SIZE
#endif

static_assert(RDMA_POOL_SMALL_CHUNK_SIZE < RDMA_POOL_MEDIUM_CHUNK_SIZE,
    "Default memory chunk size must be less than the medium chunk size");

// Description of memory pool objects:
//
// memory_region_allocator:
// An allocator that returns memory of the requested size. The memory is pinned
// and ready to be used for RDMA operations. A verbs_memory_region object is
// returned; it contains the memory registration information needed by the verbs API.
//
// rdma_chunk_pool:
// A chunk pool (adapted from Boost) that takes a block of memory (in this case
// provided by the memory_region_allocator) and divides it up into N smaller blocks.
// These smaller blocks can be used for individual objects or as buffers.
// If 16 blocks of 1K are requested, it calls the allocator and requests
// 16*1K + 16 bytes; the overhead per memory allocation request is 16 bytes.
//
// pool_container:
// The pool container wraps an rdma_chunk_pool and provides a stack. When a user
// requests a small block, one is popped off the stack. At startup, the pool_container
// requests a large number of blocks from the rdma_chunk_pool, sets the correct
// address offset within each larger chunk for each small block, and pushes the mini
// verbs_memory_region onto the stack. Thus N small rdma regions are created from a
// single larger one and memory blocks come from contiguous memory.
//
// rdma_memory_pool:
// The rdma_memory_pool maintains four pool_container stacks of different sized
// blocks so that most user requests can be fulfilled.
// If a request cannot be filled, the pool can generate temporary blocks with
// new allocations and on-the-fly registration of the memory.
// It also provides a simple API so users may pass pre-allocated memory to the
// pool for on-the-fly registration (rdma transfer of user memory chunks)
// and later de-registration.

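// For illustration only (not part of the original interface): a request that
// fits the small chunk size is served by popping a pre-registered region from
// the small pool's stack; if that stack is empty, the pool falls back to
// allocate_temporary_region(), which registers a fresh block that is deleted
// (rather than recycled) when it is released. A typical round trip looks like:
//
//     verbs_memory_region *r = pool->allocate_region(8*1024); // small pool, or temp
//     // ... fill r->get_address() with up to r->get_size() bytes ...
//     pool->deallocate(r); // back onto the stack, or unregistered and deleted
//
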
namespace hpx {
namespace parcelset {
namespace policies {
namespace verbs
{
    struct rdma_memory_pool;
}}}}

namespace hpx {
namespace parcelset {
namespace policies {
namespace verbs
{
    // Simple tag types used for logging assistance (identification)
    struct pool_tiny   { static const char *desc() { return "Tiny ";   } };
    struct pool_small  { static const char *desc() { return "Small ";  } };
    struct pool_medium { static const char *desc() { return "Medium "; } };
    struct pool_large  { static const char *desc() { return "Large ";  } };

    // ---------------------------------------------------------------------------
    // pool_container: collects routines reused by the tiny, small, medium and
    // large chunk pools
    // ---------------------------------------------------------------------------
    template <typename pool_chunk_allocator, typename PoolType>
    struct pool_container
    {
#ifndef RDMA_POOL_USE_LOCKFREE_STACK
        typedef hpx::lcos::local::spinlock                               mutex_type;
        typedef hpx::parcelset::policies::verbs::scoped_lock<mutex_type> scoped_lock;
#endif

        // ------------------------------------------------------------------------
        pool_container(verbs_protection_domain_ptr pd, std::size_t chunk_size,
            std::size_t chunks_per_block, std::size_t max_items) :
                chunk_size_(chunk_size), max_chunks_(max_items), used_(0),
                chunk_allocator(pd, chunk_size, chunks_per_block, chunks_per_block)
        {
            LOG_DEBUG_MSG(PoolType::desc() << "Creating with chunk_size "
                << hexnumber(chunk_size_) << "max_chunks " << decnumber(max_chunks_));
        }

        // ------------------------------------------------------------------------
        bool allocate_pool(std::size_t _num_chunks)
        {
            LOG_DEBUG_MSG(PoolType::desc() << "Allocating " << decnumber(_num_chunks)
                << " blocks of " << hexlength(chunk_size_));
            //
            for (std::size_t i=0; i<_num_chunks; i++) {
                LOG_TRACE_MSG(PoolType::desc() << "Allocate Block "
                    << i << " of size " << hexlength(chunk_size_));
                verbs_memory_region region = chunk_allocator.malloc();
                if (region.get_address()!=nullptr) {
                    block_list_[region.get_address()] = region;
                    // we use the pointer to the region for access
                    verbs_memory_region *r = &block_list_[region.get_address()];
                    push(r);
                }
                else {
                    LOG_ERROR_MSG(PoolType::desc()
                        << "Block allocation stopped at block " << i);
                    return false;
                }
            }
            used_ = 0;
            return true;
        }

        // ------------------------------------------------------------------------
        int DeallocatePool()
        {
            if (used_!=0) {
                LOG_ERROR_MSG(PoolType::desc()
                    << "Deallocating free_list : Not all blocks were returned "
                    << " refcounts " << decnumber(used_));
            }
            while (!free_list_.empty()) {
                chunk_allocator.free(*pop());
            }
            block_list_.clear();
            chunk_allocator.release_memory();
            return 1;
        }

        // ------------------------------------------------------------------------
        inline void push(verbs_memory_region *region)
        {
#ifndef RDMA_POOL_USE_LOCKFREE_STACK
            scoped_lock lock(memBuffer_mutex_);
#endif
            LOG_TRACE_MSG(PoolType::desc() << "Push block "
                << hexpointer(region->get_address()) << hexlength(region->get_size())
                << decnumber(used_-1));
#ifdef RDMA_POOL_MEMORY_CHECK
            LOG_TRACE_MSG(PoolType::desc()
                << "Writing 0xdeadbeef to region address "
                << hexpointer(region->get_address()));
            if (region->get_address()!=nullptr) {
                // poison the freed block so that use-after-free is easier to spot
                uintptr_t *ptr = reinterpret_cast<uintptr_t*>(region->get_address());
                for (unsigned int c=0; c<chunk_size_/sizeof(uintptr_t); ++c) {
                    ptr[c] = 0xdeadbeef;
                }
            }
#endif

#ifdef RDMA_POOL_USE_LOCKFREE_STACK
            if (!free_list_.push(region)) {
                LOG_ERROR_MSG(PoolType::desc() << "Error in memory pool push");
            }
#else
            free_list_.push(region);
#endif
            // decrement one reference
            used_--;
        }

        // ------------------------------------------------------------------------
        inline verbs_memory_region *pop()
        {
#ifndef RDMA_POOL_USE_LOCKFREE_STACK
            scoped_lock lock(memBuffer_mutex_);
#endif
            // if the free list is empty return nullptr; the caller is expected
            // to fall back to an on-the-fly (temporary) allocation
            if (free_list_.empty()) {
                return nullptr;
            }
#ifdef RDMA_POOL_USE_LOCKFREE_STACK
            // get a block; another thread may have emptied the stack since the
            // check above, so treat a failed pop the same as an empty free list
            verbs_memory_region *region = nullptr;
            if (!free_list_.pop(region)) {
                LOG_DEBUG_MSG(PoolType::desc() << "Memory pool pop failed");
                return nullptr;
            }
#else
            verbs_memory_region *region = free_list_.top();
            free_list_.pop();
#endif
            // Keep reference counts to self so that we can check
            // this pool is not deleted whilst blocks still exist
            used_++;
            LOG_TRACE_MSG(PoolType::desc() << "Pop block "
                << hexpointer(region->get_address()) << hexlength(region->get_size())
                << decnumber(used_));
            //
            return region;
        }

        void decrement_used_count(uint32_t N) {
            used_ -= N;
        }

        //
        std::size_t                                 chunk_size_;
        std::size_t                                 max_chunks_;
        std::atomic<int>                            used_;
#ifdef RDMA_POOL_USE_LOCKFREE_STACK
        boost::lockfree::stack<verbs_memory_region*,
            boost::lockfree::capacity<8192>> free_list_;
#else
        std::stack<verbs_memory_region*> free_list_;
        mutex_type                      memBuffer_mutex_;
#endif
        //
        pool_chunk_allocator                           chunk_allocator;
        std::unordered_map<char *, verbs_memory_region> block_list_;
    };
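
    // Illustrative sketch only (not part of the original interface): the
    // container's pop/push contract as used by rdma_memory_pool below.
    // pop() returns nullptr when the stack is empty so the caller can fall
    // back to a temporary registration; push() returns the block afterwards.
    //
    //     verbs_memory_region *r = container.pop();   // nullptr when stack empty
    //     if (r != nullptr) {
    //         // ... use r->get_address() / r->get_size() for an RDMA operation ...
    //         container.push(r);                       // used_ drops back down
    //     }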

    // ---------------------------------------------------------------------------
    // memory pool, holds 4 smaller pools and pops/pushes to the one
    // of the right size for the requested data
    // ---------------------------------------------------------------------------
    struct rdma_memory_pool
    {
        HPX_NON_COPYABLE(rdma_memory_pool);

        //----------------------------------------------------------------------------
        // constructor
        rdma_memory_pool(verbs_protection_domain_ptr pd) :
            protection_domain_(pd),
            tiny_  (pd, RDMA_POOL_1K_CHUNK,         1024, RDMA_POOL_MAX_1K_CHUNKS),
            small_ (pd, RDMA_POOL_SMALL_CHUNK_SIZE, 1024, RDMA_POOL_MAX_SMALL_CHUNKS),
            medium_(pd, RDMA_POOL_MEDIUM_CHUNK_SIZE,  64, RDMA_POOL_MAX_MEDIUM_CHUNKS),
            large_ (pd, RDMA_POOL_LARGE_CHUNK_SIZE,   32, RDMA_POOL_MAX_LARGE_CHUNKS),
            temp_regions(0),
            user_regions(0)
        {
            tiny_.allocate_pool(RDMA_POOL_MAX_1K_CHUNKS);
            small_.allocate_pool(RDMA_POOL_MAX_SMALL_CHUNKS);
            medium_.allocate_pool(RDMA_POOL_MAX_MEDIUM_CHUNKS);
            large_.allocate_pool(RDMA_POOL_MAX_LARGE_CHUNKS);
            LOG_DEBUG_MSG("Completed memory_pool initialization");
        }

        //----------------------------------------------------------------------------
        // destructor
        ~rdma_memory_pool()
        {
            deallocate_pools();
        }

        //----------------------------------------------------------------------------
        int deallocate_pools()
        {
            bool ok = true;
            ok = ok && tiny_.DeallocatePool();
            ok = ok && small_.DeallocatePool();
            ok = ok && medium_.DeallocatePool();
            ok = ok && large_.DeallocatePool();
            return ok;
        }

        // -------------------------
        // User allocation interface
        // -------------------------
        // The verbs_memory_region* versions of allocate/deallocate
        // should be used in preference to the std:: compatible
        // versions using char* for efficiency

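        // Illustrative comparison (sketch only, not part of the original code):
        //
        //     // preferred: keep hold of the region pointer
        //     verbs_memory_region *r = pool.allocate_region(1024);
        //     pool.deallocate(r);
        //
        //     // STL-style: only an address is returned, so releasing it later
        //     // needs a map lookup to recover the owning region
        //     char *buf = pool.allocate(1024);
        //     pool.deallocate(buf);
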
        //----------------------------------------------------------------------------
        // query the pool for a chunk of a given size to see if one is available
        // this function is 'unsafe' because it is not thread safe and another
        // thread may push/pop a block after this is called and invalidate the result.
        inline bool can_allocate_unsafe(size_t length) const
        {
            if (length<=tiny_.chunk_size_) {
                return !tiny_.free_list_.empty();
            }
            else if (length<=small_.chunk_size_) {
                return !small_.free_list_.empty();
            }
            else if (length<=medium_.chunk_size_) {
                return !medium_.free_list_.empty();
            }
            else if (length<=large_.chunk_size_) {
                return !large_.free_list_.empty();
            }
            return true;
        }

        //----------------------------------------------------------------------------
        // allocate a region, if size=0 a tiny region is returned
        inline verbs_memory_region *allocate_region(size_t length)
        {
            verbs_memory_region *region = nullptr;
            //
            if (length<=tiny_.chunk_size_) {
                region = tiny_.pop();
            }
            else if (length<=small_.chunk_size_) {
                region = small_.pop();
            }
            else if (length<=medium_.chunk_size_) {
                region = medium_.pop();
            }
            else if (length<=large_.chunk_size_) {
                region = large_.pop();
            }
            // if we didn't get a block from the cache, create one on the fly
            if (region==nullptr) {
                region = allocate_temporary_region(length);
            }

            LOG_TRACE_MSG("Popping Block"
                << " buffer "    << hexpointer(region->get_address())
                << " region "    << hexpointer(region)
                << " size "      << hexlength(region->get_size())
                << " chunksize " << hexlength(small_.chunk_size_) << " "
                << hexlength(medium_.chunk_size_) << " " << hexlength(large_.chunk_size_)
                << " free (t) "  << (RDMA_POOL_MAX_1K_CHUNKS-tiny_.used_)
                << " used "      << decnumber(this->tiny_.used_)
                << " free (s) "  << (RDMA_POOL_MAX_SMALL_CHUNKS-small_.used_)
                << " used "      << decnumber(this->small_.used_)
                << " free (m) "  << (RDMA_POOL_MAX_MEDIUM_CHUNKS-medium_.used_)
                << " used "      << decnumber(this->medium_.used_)
                << " free (l) "  << (RDMA_POOL_MAX_LARGE_CHUNKS-large_.used_)
                << " used "      << decnumber(this->large_.used_));
            //
            return region;
        }

        //----------------------------------------------------------------------------
        // release a region back to the pool
        inline void deallocate(verbs_memory_region *region)
        {
            // if this region was registered on the fly, then don't return it to the pool
            if (region->get_temp_region() || region->get_user_region()) {
                if (region->get_temp_region()) {
                    temp_regions--;
                    LOG_TRACE_MSG("Deallocating temp registered block "
                        << hexpointer(region->get_address()) << decnumber(temp_regions));
                }
                else if (region->get_user_region()) {
                    user_regions--;
                    LOG_TRACE_MSG("Deleting (user region) "
                        << hexpointer(region->get_address()) << decnumber(user_regions));
                }
                delete region;
                return;
            }

            // put the block back on the free list
            if (region->get_size()<=tiny_.chunk_size_) {
                tiny_.push(region);
            }
            else if (region->get_size()<=small_.chunk_size_) {
                small_.push(region);
            }
            else if (region->get_size()<=medium_.chunk_size_) {
                medium_.push(region);
            }
            else if (region->get_size()<=large_.chunk_size_) {
                large_.push(region);
            }

            LOG_TRACE_MSG("Pushing Block"
                << " buffer "    << hexpointer(region->get_address())
                << " region "    << hexpointer(region)
                << " free (t) "  << (RDMA_POOL_MAX_1K_CHUNKS-tiny_.used_)
                << " used "      << decnumber(this->tiny_.used_)
                << " free (s) "  << (RDMA_POOL_MAX_SMALL_CHUNKS-small_.used_)
                << " used "      << decnumber(this->small_.used_)
                << " free (m) "  << (RDMA_POOL_MAX_MEDIUM_CHUNKS-medium_.used_)
                << " used "      << decnumber(this->medium_.used_)
                << " free (l) "  << (RDMA_POOL_MAX_LARGE_CHUNKS-large_.used_)
                << " used "      << decnumber(this->large_.used_));
        }

        //----------------------------------------------------------------------------
        // allocates a region from the heap and registers it, bypassing the pool;
        // when deallocated, it will be unregistered and deleted, not returned to the pool
        inline verbs_memory_region* allocate_temporary_region(std::size_t length)
        {
            verbs_memory_region *region = new verbs_memory_region();
            region->set_temp_region();
            region->allocate(protection_domain_, length);
            temp_regions++;
            LOG_TRACE_MSG("Allocating temp registered block "
                << hexpointer(region->get_address()) << hexlength(length)
                << decnumber(temp_regions));
            return region;
        }

        //----------------------------------------------------------------------------
        // allocate a region, returning a memory block address
        // this is compatible with STL-like allocators but should be avoided
        // as deallocation requires a map lookup of the address to find its block
        char *allocate(size_t length)
        {
            verbs_memory_region *region = allocate_region(length);
            // remember which region wraps this address so that the address-based
            // deallocate() below can recover it
            pointer_map_[region->get_address()] = region;
            return region->get_address();
        }

        //----------------------------------------------------------------------------
        // deallocate a region using its memory address as the handle
        // this involves a map lookup to find the region and is therefore
        // less efficient than releasing memory via the region pointer
        void deallocate(void *address, size_t size=0)
        {
            verbs_memory_region *region = pointer_map_[address];
            pointer_map_.erase(address);
            deallocate(region);
        }

        //----------------------------------------------------------------------------
        // find a verbs_memory_region* from the memory address it wraps
        verbs_memory_region *RegionFromAddress(char * const addr) {
            return pointer_map_[addr];
        }

        //----------------------------------------------------------------------------
        // internal variables
        //----------------------------------------------------------------------------
        // used to map the internal memory address to the region that
        // holds the registration information
        std::unordered_map<const void *, verbs_memory_region*> pointer_map_;

        // protection domain that memory is registered with
        verbs_protection_domain_ptr protection_domain_;

        // maintain 4 pools of thread safe pre-allocated regions of fixed size.
        // they obtain their memory from the segmented storage provided
        pool_container<rdma_chunk_pool<memory_region_allocator>, pool_tiny> tiny_;
        pool_container<rdma_chunk_pool<memory_region_allocator>, pool_small> small_;
        pool_container<rdma_chunk_pool<memory_region_allocator>, pool_medium> medium_;
        pool_container<rdma_chunk_pool<memory_region_allocator>, pool_large> large_;
        //
        // counters for on-the-fly registered regions
        std::atomic<int> temp_regions;
        std::atomic<int> user_regions;
    };

    typedef std::shared_ptr<rdma_memory_pool> rdma_memory_pool_ptr;
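
    // ------------------------------------------------------------------------
    // Usage sketch (illustrative only, not part of the original interface).
    // It shows the intended allocate/deallocate round trip through the pool;
    // the protection domain is assumed to have been created by the parcelport
    // elsewhere and this helper function itself is hypothetical.
    inline void rdma_memory_pool_usage_sketch(verbs_protection_domain_ptr pd)
    {
        rdma_memory_pool pool(pd);
        // a 512 byte request is served from the tiny pool's free list; larger
        // requests go to the small/medium/large pools or a temporary region
        verbs_memory_region *region = pool.allocate_region(512);
        // ... stage up to region->get_size() bytes at region->get_address() ...
        // return the block to its free list (temporary regions are deleted)
        pool.deallocate(region);
    }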
}}}}

#endif