1 // Copyright (c) 2014-2016 John Biddiscombe 2 // 3 // Distributed under the Boost Software License, Version 1.0. (See accompanying 4 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 5 6 #ifndef HPX_PARCELSET_POLICIES_VERBS_MEMORY_POOL 7 #define HPX_PARCELSET_POLICIES_VERBS_MEMORY_POOL 8 9 #include <hpx/lcos/local/mutex.hpp> 10 #include <hpx/lcos/local/spinlock.hpp> 11 // 12 #include <atomic> 13 #include <stack> 14 #include <unordered_map> 15 #include <iostream> 16 #include <cstddef> 17 #include <memory> 18 // 19 #include <boost/lockfree/stack.hpp> 20 // 21 #include <hpx/config/parcelport_defines.hpp> 22 // 23 #include <plugins/parcelport/parcelport_logging.hpp> 24 #include <plugins/parcelport/verbs/rdma/rdma_locks.hpp> 25 #include <plugins/parcelport/verbs/rdma/rdma_chunk_pool.hpp> 26 #include <plugins/parcelport/verbs/rdma/verbs_protection_domain.hpp> 27 #include <plugins/parcelport/verbs/rdma/verbs_memory_region.hpp> 28 29 // the default memory chunk size in bytes 30 #define RDMA_POOL_1K_CHUNK 0x001*0x0400 // 1KB 31 #define RDMA_POOL_SMALL_CHUNK_SIZE 0x010*0x0400 // 16KB 32 #define RDMA_POOL_MEDIUM_CHUNK_SIZE 0x040*0x0400 // 64KB 33 #define RDMA_POOL_LARGE_CHUNK_SIZE 0x400*0x0400 // 1MB 34 35 #define RDMA_POOL_MAX_1K_CHUNKS 1024 36 #define RDMA_POOL_MAX_SMALL_CHUNKS 1024 37 #define RDMA_POOL_MAX_MEDIUM_CHUNKS 64 38 #define RDMA_POOL_MAX_LARGE_CHUNKS 32 39 40 #define RDMA_POOL_USE_LOCKFREE_STACK 1 41 42 // if the HPX configuration has set a different value, use it 43 #if defined(HPX_PARCELPORT_VERBS_MEMORY_CHUNK_SIZE) 44 # undef RDMA_POOL_SMALL_CHUNK_SIZE 45 # define RDMA_POOL_SMALL_CHUNK_SIZE HPX_PARCELPORT_VERBS_MEMORY_CHUNK_SIZE 46 #endif 47 48 static_assert ( HPX_PARCELPORT_VERBS_MEMORY_CHUNK_SIZE<RDMA_POOL_MEDIUM_CHUNK_SIZE , 49 "Default memory Chunk size must be less than medium chunk size" ); 50 51 52 // Description of memory pool objects: 53 // 54 // memory_region_allocator: 55 // An allocator that returns memory of the requested size. The memory is pinned 56 // and ready to be used for RDMA operations. A memory_region object is 57 // used, it contains the memory registration information needed by the verbs API. 58 // 59 // rdma_chunk_pool : 60 // This is a class taken from boost that takes a block of memory (in this case provided 61 // by the memory_region_allocator) and divides it up into N smaller blocks. 62 // These smaller blocks can be used for individual objects or can be used as buffers. 63 // If 16 blocks of 1K are requested, it will call the allocator and request 64 // 16*1K + 16 bytes. The overhead per memory allocation request is 16 bytes 65 // 66 // pool_container: 67 // The pool container wraps an rdma_chunk_pool and provides a stack. When a user 68 // requests a small block, one is popped off the stack. At startup, the pool_container 69 // requests a large number of blocks from the rdma_chunk_pool and sets the correct 70 // address offset within each larger chunk for each small block and pushes the mini 71 // verbs_memory_region onto the stack. Thus N small rdma_regions are created from a 72 // single larger one and memory blocks come from contiguous memory. 73 // 74 // rdma_memory_pool: 75 // The rdma_memory_pool maintains 4 pool_container (stacks) of different sized blocks 76 // so that most user requests can be fulfilled. 77 // If a request cannot be filled, the pool can generate temporary blocks with 78 // new allocations and on-the-fly registration of the memory. 79 // Additionally, it also provides a simple API so users may pass pre-allocated 80 // memory to the pool for on-the-fly registration (rdma transfer of user memory chunks) 81 // and later de-registration. 82 83 namespace hpx { 84 namespace parcelset { 85 namespace policies { 86 namespace verbs 87 { 88 struct rdma_memory_pool; 89 }}}} 90 91 namespace hpx { 92 namespace parcelset { 93 namespace policies { 94 namespace verbs 95 { 96 // A simple tag type we use for logging assistance (identification) deschpx::parcelset::policies::verbs::pool_tiny97 struct pool_tiny { static const char *desc() { return "Tiny "; } }; deschpx::parcelset::policies::verbs::pool_small98 struct pool_small { static const char *desc() { return "Small "; } }; deschpx::parcelset::policies::verbs::pool_medium99 struct pool_medium { static const char *desc() { return "Medium "; } }; deschpx::parcelset::policies::verbs::pool_large100 struct pool_large { static const char *desc() { return "Large "; } }; 101 102 // --------------------------------------------------------------------------- 103 // pool_container, collect some routines for reuse with 104 // small, medium, large chunks 105 // --------------------------------------------------------------------------- 106 template <typename pool_chunk_allocator, typename PoolType> 107 struct pool_container 108 { 109 #ifndef RDMA_POOL_USE_LOCKFREE_STACK 110 typedef hpx::lcos::local::spinlock mutex_type; 111 typedef hpx::parcelset::policies::verbs::scoped_lock<mutex_type> scoped_lock; 112 #endif 113 114 // ------------------------------------------------------------------------ pool_containerhpx::parcelset::policies::verbs::pool_container115 pool_container(verbs_protection_domain_ptr pd, std::size_t chunk_size, 116 std::size_t chunks_per_block, std::size_t max_items) : 117 chunk_size_(chunk_size), max_chunks_(max_items), used_(0), 118 chunk_allocator(pd, chunk_size, chunks_per_block, chunks_per_block) 119 { 120 LOG_DEBUG_MSG(PoolType::desc() << "Creating with chunk_size " 121 << hexnumber(chunk_size_) << "max_chunks " << decnumber(max_chunks_)); 122 } 123 124 // ------------------------------------------------------------------------ allocate_poolhpx::parcelset::policies::verbs::pool_container125 bool allocate_pool(std::size_t _num_chunks) 126 { 127 LOG_DEBUG_MSG(PoolType::desc() << "Allocating " << decnumber(_num_chunks) 128 << " blocks of " << hexlength(chunk_size_)); 129 // 130 for (std::size_t i=0; i<_num_chunks; i++) { 131 LOG_TRACE_MSG(PoolType::desc() << "Allocate Block " 132 << i << " of size " << hexlength(chunk_size_)); 133 verbs_memory_region region = chunk_allocator.malloc(); 134 if (region.get_address()!=nullptr) { 135 block_list_[region.get_address()] = region; 136 // we use the pointer to the region for access 137 verbs_memory_region *r = &block_list_[region.get_address()]; 138 push(r); 139 } 140 else { 141 LOG_ERROR_MSG(PoolType::desc() 142 << "Block Allocation Stopped at " << (i-1)); 143 return false; 144 } 145 } 146 used_ = 0; 147 return true; 148 } 149 150 // ------------------------------------------------------------------------ DeallocatePoolhpx::parcelset::policies::verbs::pool_container151 int DeallocatePool() 152 { 153 if (used_!=0) { 154 LOG_ERROR_MSG(PoolType::desc() 155 << "Deallocating free_list : Not all blocks were returned " 156 << " refcounts " << decnumber(used_)); 157 } 158 while (!free_list_.empty()) { 159 chunk_allocator.free(*pop()); 160 } 161 block_list_.clear(); 162 chunk_allocator.release_memory(); 163 return 1; 164 } 165 166 // ------------------------------------------------------------------------ pushhpx::parcelset::policies::verbs::pool_container167 inline void push(verbs_memory_region *region) 168 { 169 #ifndef RDMA_POOL_USE_LOCKFREE_STACK 170 scoped_lock lock(memBuffer_mutex_); 171 #endif 172 LOG_TRACE_MSG(PoolType::desc() << "Push block " 173 << hexpointer(region->get_address()) << hexlength(region->get_size()) 174 << decnumber(used_-1)); 175 #ifdef RDMA_POOL_MEMORY_CHECK 176 uintptr_t val = uintptr_t(region->get_address()); 177 LOG_TRACE_MSG(PoolType::desc() 178 << "Writing 0xdeadbeef to region address " 179 << hexpointer(region->get_address())); 180 if (region->get_address()!=nullptr) { 181 // get use the pointer to the region 182 uintptr_t *ptr = reinterpret_cast<uintptr_t*>(region->get_address()); 183 for (unsigned int c=0; c<chunk_size_/8; ++c) { 184 ptr[c] = 0xdeadbeef; 185 ptr[c] = val; 186 } 187 } 188 #endif 189 190 #ifdef RDMA_POOL_USE_LOCKFREE_STACK 191 if (!free_list_.push(region)) { 192 LOG_ERROR_MSG(PoolType::desc() << "Error in memory pool push"); 193 } 194 #else 195 free_list_.push(region); 196 #endif 197 // decrement one reference 198 used_--; 199 } 200 201 // ------------------------------------------------------------------------ pophpx::parcelset::policies::verbs::pool_container202 inline verbs_memory_region *pop() 203 { 204 #ifndef RDMA_POOL_USE_LOCKFREE_STACK 205 scoped_lock lock(memBuffer_mutex_); 206 #endif 207 // if we have not exceeded our max size, allocate a new block 208 if (free_list_.empty()) { 209 // LOG_TRACE_MSG("Creating new small Block as free list is empty " 210 // "but max chunks " << max_small_chunks_ << " not reached"); 211 // AllocateRegisteredBlock(length); 212 //std::terminate(); 213 return nullptr; 214 } 215 #ifdef RDMA_POOL_USE_LOCKFREE_STACK 216 // get a block 217 verbs_memory_region *region = nullptr; 218 if (!free_list_.pop(region)) { 219 LOG_DEBUG_MSG(PoolType::desc() << "Error in memory pool pop"); 220 } 221 #else 222 verbs_memory_region *region = free_list_.top(); 223 free_list_.pop(); 224 #endif 225 // Keep reference counts to self so that we can check 226 // this pool is not deleted whilst blocks still exist 227 used_++; 228 LOG_TRACE_MSG(PoolType::desc() << "Pop block " 229 << hexpointer(region->get_address()) << hexlength(region->get_size()) 230 << decnumber(used_)); 231 // 232 return region; 233 } 234 decrement_used_counthpx::parcelset::policies::verbs::pool_container235 void decrement_used_count(uint32_t N) { 236 used_ -= N; 237 } 238 239 // 240 std::size_t chunk_size_; 241 std::size_t max_chunks_; 242 std::atomic<int> used_; 243 #ifdef RDMA_POOL_USE_LOCKFREE_STACK 244 boost::lockfree::stack<verbs_memory_region*, 245 boost::lockfree::capacity<8192>> free_list_; 246 #else 247 std::stack<verbs_memory_region*> free_list_; 248 mutex_type memBuffer_mutex_; 249 #endif 250 // 251 pool_chunk_allocator chunk_allocator; 252 std::unordered_map<char *, verbs_memory_region> block_list_; 253 }; 254 255 // --------------------------------------------------------------------------- 256 // memory pool, holds 4 smaller pools and pops/pushes to the one 257 // of the right size for the requested data 258 // --------------------------------------------------------------------------- 259 struct rdma_memory_pool 260 { 261 HPX_NON_COPYABLE(rdma_memory_pool); 262 263 //---------------------------------------------------------------------------- 264 // constructor rdma_memory_poolhpx::parcelset::policies::verbs::rdma_memory_pool265 rdma_memory_pool(verbs_protection_domain_ptr pd) : 266 protection_domain_(pd), 267 tiny_ (pd, RDMA_POOL_1K_CHUNK, 1024, RDMA_POOL_MAX_1K_CHUNKS), 268 small_ (pd, RDMA_POOL_SMALL_CHUNK_SIZE, 1024, RDMA_POOL_MAX_SMALL_CHUNKS), 269 medium_(pd, RDMA_POOL_MEDIUM_CHUNK_SIZE, 64, RDMA_POOL_MAX_MEDIUM_CHUNKS), 270 large_ (pd, RDMA_POOL_LARGE_CHUNK_SIZE, 32, RDMA_POOL_MAX_LARGE_CHUNKS), 271 temp_regions(0), 272 user_regions(0) 273 { 274 tiny_.allocate_pool(RDMA_POOL_MAX_1K_CHUNKS); 275 small_.allocate_pool(RDMA_POOL_MAX_SMALL_CHUNKS); 276 medium_.allocate_pool(RDMA_POOL_MAX_MEDIUM_CHUNKS); 277 large_.allocate_pool(RDMA_POOL_MAX_LARGE_CHUNKS); 278 LOG_DEBUG_MSG("Completed memory_pool initialization"); 279 } 280 281 //---------------------------------------------------------------------------- 282 // destructor ~rdma_memory_poolhpx::parcelset::policies::verbs::rdma_memory_pool283 ~rdma_memory_pool() 284 { 285 deallocate_pools(); 286 } 287 288 //---------------------------------------------------------------------------- deallocate_poolshpx::parcelset::policies::verbs::rdma_memory_pool289 int deallocate_pools() 290 { 291 bool ok = true; 292 ok = ok && tiny_.DeallocatePool(); 293 ok = ok && small_.DeallocatePool(); 294 ok = ok && medium_.DeallocatePool(); 295 ok = ok && large_.DeallocatePool(); 296 return ok; 297 } 298 299 // ------------------------- 300 // User allocation interface 301 // ------------------------- 302 // The verbs_memory_region* versions of allocate/deallocate 303 // should be used in preference to the std:: compatible 304 // versions using char* for efficiency 305 306 //---------------------------------------------------------------------------- 307 // query the pool for a chunk of a given size to see if one is available 308 // this function is 'unsafe' because it is not thread safe and another 309 // thread may push/pop a block after this is called and invalidate the result. can_allocate_unsafehpx::parcelset::policies::verbs::rdma_memory_pool310 inline bool can_allocate_unsafe(size_t length) const 311 { 312 if (length<=tiny_.chunk_size_) { 313 return !tiny_.free_list_.empty(); 314 } 315 else if (length<=small_.chunk_size_) { 316 return !small_.free_list_.empty(); 317 } 318 else if (length<=medium_.chunk_size_) { 319 return !medium_.free_list_.empty(); 320 } 321 else if (length<=large_.chunk_size_) { 322 return !large_.free_list_.empty(); 323 } 324 return true; 325 } 326 327 //---------------------------------------------------------------------------- 328 // allocate a region, if size=0 a tiny region is returned allocate_regionhpx::parcelset::policies::verbs::rdma_memory_pool329 inline verbs_memory_region *allocate_region(size_t length) 330 { 331 verbs_memory_region *region = nullptr; 332 // 333 if (length<=tiny_.chunk_size_) { 334 region = tiny_.pop(); 335 } 336 else if (length<=small_.chunk_size_) { 337 region = small_.pop(); 338 } 339 else if (length<=medium_.chunk_size_) { 340 region = medium_.pop(); 341 } 342 else if (length<=large_.chunk_size_) { 343 region = large_.pop(); 344 } 345 // if we didn't get a block from the cache, create one on the fly 346 if (region==nullptr) { 347 region = allocate_temporary_region(length); 348 } 349 350 LOG_TRACE_MSG("Popping Block" 351 << " buffer " << hexpointer(region->get_address()) 352 << " region " << hexpointer(region) 353 << " size " << hexlength(region->get_size()) 354 << " chunksize " << hexlength(small_.chunk_size_) << " " 355 << hexlength(medium_.chunk_size_) << " " << hexlength(large_.chunk_size_) 356 << " free (t) " << (RDMA_POOL_MAX_1K_CHUNKS-tiny_.used_) 357 << " used " << decnumber(this->small_.used_) 358 << " free (s) " << (RDMA_POOL_MAX_SMALL_CHUNKS-small_.used_) 359 << " used " << decnumber(this->small_.used_) 360 << " free (m) " << (RDMA_POOL_MAX_MEDIUM_CHUNKS-medium_.used_) 361 << " used " << decnumber(this->medium_.used_) 362 << " free (l) " << (RDMA_POOL_MAX_LARGE_CHUNKS-large_.used_) 363 << " used " << decnumber(this->large_.used_)); 364 // 365 return region; 366 } 367 368 //---------------------------------------------------------------------------- 369 // release a region back to the pool deallocatehpx::parcelset::policies::verbs::rdma_memory_pool370 inline void deallocate(verbs_memory_region *region) 371 { 372 // if this region was registered on the fly, then don't return it to the pool 373 if (region->get_temp_region() || region->get_user_region()) { 374 if (region->get_temp_region()) { 375 temp_regions--; 376 LOG_TRACE_MSG("Deallocating temp registered block " 377 << hexpointer(region->get_address()) << decnumber(temp_regions)); 378 } 379 else if (region->get_user_region()) { 380 user_regions--; 381 LOG_TRACE_MSG("Deleting (user region) " 382 << hexpointer(region->get_address()) << decnumber(user_regions)); 383 } 384 delete region; 385 return; 386 } 387 388 // put the block back on the free list 389 if (region->get_size()<=tiny_.chunk_size_) { 390 tiny_.push(region); 391 } 392 else if (region->get_size()<=small_.chunk_size_) { 393 small_.push(region); 394 } 395 else if (region->get_size()<=medium_.chunk_size_) { 396 medium_.push(region); 397 } 398 else if (region->get_size()<=large_.chunk_size_) { 399 large_.push(region); 400 } 401 402 LOG_TRACE_MSG("Pushing Block" 403 << " buffer " << hexpointer(region->get_address()) 404 << " region " << hexpointer(region) 405 << " free (t) " << (RDMA_POOL_MAX_1K_CHUNKS-tiny_.used_) 406 << " used " << decnumber(this->small_.used_) 407 << " free (s) " << (RDMA_POOL_MAX_SMALL_CHUNKS-small_.used_) 408 << " used " << decnumber(this->small_.used_) 409 << " free (m) " << (RDMA_POOL_MAX_MEDIUM_CHUNKS-medium_.used_) 410 << " used " << decnumber(this->medium_.used_) 411 << " free (l) " << (RDMA_POOL_MAX_LARGE_CHUNKS-large_.used_) 412 << " used " << decnumber(this->large_.used_)); 413 } 414 415 //---------------------------------------------------------------------------- 416 // allocates a region from the heap and registers it, it bypasses the pool 417 // when deallocted, it will be unregistered and deleted, not returned to the pool allocate_temporary_regionhpx::parcelset::policies::verbs::rdma_memory_pool418 inline verbs_memory_region* allocate_temporary_region(std::size_t length) 419 { 420 verbs_memory_region *region = new verbs_memory_region(); 421 region->set_temp_region(); 422 region->allocate(protection_domain_, length); 423 temp_regions++; 424 LOG_TRACE_MSG("Allocating temp registered block " 425 << hexpointer(region->get_address()) << hexlength(length) 426 << decnumber(temp_regions)); 427 return region; 428 } 429 430 //---------------------------------------------------------------------------- 431 // allocate a region, returning a memory block address 432 // this is compatible with STL like allocators but should be avoided 433 // as deallocation requires a map lookup of the address to find it's block allocatehpx::parcelset::policies::verbs::rdma_memory_pool434 char *allocate(size_t length) 435 { 436 verbs_memory_region *region = allocate_region(length); 437 return region->get_address(); 438 } 439 440 //---------------------------------------------------------------------------- 441 // deallocate a region using its memory address as handle 442 // this involves a map lookup to find the region and is therefore 443 // less efficient than releasing memory via the region pointer deallocatehpx::parcelset::policies::verbs::rdma_memory_pool444 void deallocate(void *address, size_t size=0) 445 { 446 verbs_memory_region *region = pointer_map_[address]; 447 deallocate(region); 448 } 449 450 //---------------------------------------------------------------------------- 451 // find an verbs_memory_region* from the memory address it wraps RegionFromAddresshpx::parcelset::policies::verbs::rdma_memory_pool452 verbs_memory_region *RegionFromAddress(char * const addr) { 453 return pointer_map_[addr]; 454 } 455 456 //---------------------------------------------------------------------------- 457 // internal variables 458 //---------------------------------------------------------------------------- 459 // used to map the internal memory address to the region that 460 // holds the registration information 461 std::unordered_map<const void *, verbs_memory_region*> pointer_map_; 462 463 // protection domain that memory is registered with 464 verbs_protection_domain_ptr protection_domain_; 465 466 // maintain 4 pools of thread safe pre-allocated regions of fixed size. 467 // they obtain their memory from the segmented storage provided 468 pool_container<rdma_chunk_pool<memory_region_allocator>, pool_tiny> tiny_; 469 pool_container<rdma_chunk_pool<memory_region_allocator>, pool_small> small_; 470 pool_container<rdma_chunk_pool<memory_region_allocator>, pool_medium> medium_; 471 pool_container<rdma_chunk_pool<memory_region_allocator>, pool_large> large_; 472 // 473 // a counter 474 std::atomic<int> temp_regions; 475 std::atomic<int> user_regions; 476 }; 477 478 typedef std::shared_ptr<rdma_memory_pool> rdma_memory_pool_ptr; 479 }}}} 480 481 #endif 482