1 /****************************************************************************** 2 * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the NVIDIA CORPORATION nor the 13 * names of its contributors may be used to endorse or promote products 14 * derived from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 ******************************************************************************/ 28 29 /****************************************************************************** 30 * Simple caching allocator for device memory allocations. The allocator is 31 * thread-safe and capable of managing device allocations on multiple devices. 32 ******************************************************************************/ 33 34 #pragma once 35 36 #include "util_namespace.cuh" 37 #include "util_debug.cuh" 38 39 #include <set> 40 #include <map> 41 42 #include "host/mutex.cuh" 43 #include <math.h> 44 45 /// Optional outer namespace(s) 46 CUB_NS_PREFIX 47 48 /// CUB namespace 49 namespace cub { 50 51 52 /** 53 * \addtogroup UtilMgmt 54 * @{ 55 */ 56 57 58 /****************************************************************************** 59 * CachingDeviceAllocator (host use) 60 ******************************************************************************/ 61 62 /** 63 * \brief A simple caching allocator for device memory allocations. 64 * 65 * \par Overview 66 * The allocator is thread-safe and stream-safe and is capable of managing cached 67 * device allocations on multiple devices. It behaves as follows: 68 * 69 * \par 70 * - Allocations from the allocator are associated with an \p active_stream. Once freed, 71 * the allocation becomes available immediately for reuse within the \p active_stream 72 * with which it was associated with during allocation, and it becomes available for 73 * reuse within other streams when all prior work submitted to \p active_stream has completed. 74 * - Allocations are categorized and cached by bin size. A new allocation request of 75 * a given size will only consider cached allocations within the corresponding bin. 76 * - Bin limits progress geometrically in accordance with the growth factor 77 * \p bin_growth provided during construction. Unused device allocations within 78 * a larger bin cache are not reused for allocation requests that categorize to 79 * smaller bin sizes. 80 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to 81 * (\p bin_growth ^ \p min_bin). 82 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest 83 * bin and are simply freed when they are deallocated instead of being returned 84 * to a bin-cache. 85 * - %If the total storage of cached allocations on a given device will exceed 86 * \p max_cached_bytes, allocations for that device are simply freed when they are 87 * deallocated instead of being returned to their bin-cache. 88 * 89 * \par 90 * For example, the default-constructed CachingDeviceAllocator is configured with: 91 * - \p bin_growth = 8 92 * - \p min_bin = 3 93 * - \p max_bin = 7 94 * - \p max_cached_bytes = 6MB - 1B 95 * 96 * \par 97 * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB 98 * and sets a maximum of 6,291,455 cached bytes per device 99 * 100 */ 101 struct CachingDeviceAllocator 102 { 103 104 //--------------------------------------------------------------------- 105 // Constants 106 //--------------------------------------------------------------------- 107 108 /// Out-of-bounds bin 109 static const unsigned int INVALID_BIN = (unsigned int) -1; 110 111 /// Invalid size 112 static const size_t INVALID_SIZE = (size_t) -1; 113 114 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 115 116 /// Invalid device ordinal 117 static const int INVALID_DEVICE_ORDINAL = -1; 118 119 //--------------------------------------------------------------------- 120 // Type definitions and helper types 121 //--------------------------------------------------------------------- 122 123 /** 124 * Descriptor for device memory allocations 125 */ 126 struct BlockDescriptor 127 { 128 void* d_ptr; // Device pointer 129 size_t bytes; // Size of allocation in bytes 130 unsigned int bin; // Bin enumeration 131 int device; // device ordinal 132 cudaStream_t associated_stream; // Associated associated_stream 133 cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed 134 135 // Constructor (suitable for searching maps for a specific block, given its pointer and device) BlockDescriptorcub::CachingDeviceAllocator::BlockDescriptor136 BlockDescriptor(void *d_ptr, int device) : 137 d_ptr(d_ptr), 138 bytes(0), 139 bin(INVALID_BIN), 140 device(device), 141 associated_stream(0), 142 ready_event(0) 143 {} 144 145 // Constructor (suitable for searching maps for a range of suitable blocks, given a device) BlockDescriptorcub::CachingDeviceAllocator::BlockDescriptor146 BlockDescriptor(int device) : 147 d_ptr(NULL), 148 bytes(0), 149 bin(INVALID_BIN), 150 device(device), 151 associated_stream(0), 152 ready_event(0) 153 {} 154 155 // Comparison functor for comparing device pointers PtrComparecub::CachingDeviceAllocator::BlockDescriptor156 static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) 157 { 158 if (a.device == b.device) 159 return (a.d_ptr < b.d_ptr); 160 else 161 return (a.device < b.device); 162 } 163 164 // Comparison functor for comparing allocation sizes SizeComparecub::CachingDeviceAllocator::BlockDescriptor165 static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) 166 { 167 if (a.device == b.device) 168 return (a.bytes < b.bytes); 169 else 170 return (a.device < b.device); 171 } 172 }; 173 174 /// BlockDescriptor comparator function interface 175 typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); 176 177 class TotalBytes { 178 public: 179 size_t free; 180 size_t live; TotalBytes()181 TotalBytes() { free = live = 0; } 182 }; 183 184 /// Set type for cached blocks (ordered by size) 185 typedef std::multiset<BlockDescriptor, Compare> CachedBlocks; 186 187 /// Set type for live blocks (ordered by ptr) 188 typedef std::multiset<BlockDescriptor, Compare> BusyBlocks; 189 190 /// Map type of device ordinals to the number of cached bytes cached by each device 191 typedef std::map<int, TotalBytes> GpuCachedBytes; 192 193 194 //--------------------------------------------------------------------- 195 // Utility functions 196 //--------------------------------------------------------------------- 197 198 /** 199 * Integer pow function for unsigned base and exponent 200 */ IntPowcub::CachingDeviceAllocator201 static unsigned int IntPow( 202 unsigned int base, 203 unsigned int exp) 204 { 205 unsigned int retval = 1; 206 while (exp > 0) 207 { 208 if (exp & 1) { 209 retval = retval * base; // multiply the result by the current base 210 } 211 base = base * base; // square the base 212 exp = exp >> 1; // divide the exponent in half 213 } 214 return retval; 215 } 216 217 218 /** 219 * Round up to the nearest power-of 220 */ NearestPowerOfcub::CachingDeviceAllocator221 void NearestPowerOf( 222 unsigned int &power, 223 size_t &rounded_bytes, 224 unsigned int base, 225 size_t value) 226 { 227 power = 0; 228 rounded_bytes = 1; 229 230 if (value * base < value) 231 { 232 // Overflow 233 power = sizeof(size_t) * 8; 234 rounded_bytes = size_t(0) - 1; 235 return; 236 } 237 238 while (rounded_bytes < value) 239 { 240 rounded_bytes *= base; 241 power++; 242 } 243 } 244 245 246 //--------------------------------------------------------------------- 247 // Fields 248 //--------------------------------------------------------------------- 249 250 cub::Mutex mutex; /// Mutex for thread-safety 251 252 unsigned int bin_growth; /// Geometric growth factor for bin-sizes 253 unsigned int min_bin; /// Minimum bin enumeration 254 unsigned int max_bin; /// Maximum bin enumeration 255 256 size_t min_bin_bytes; /// Minimum bin size 257 size_t max_bin_bytes; /// Maximum bin size 258 size_t max_cached_bytes; /// Maximum aggregate cached bytes per device 259 260 const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) 261 bool debug; /// Whether or not to print (de)allocation events to stdout 262 263 GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device 264 CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse 265 BusyBlocks live_blocks; /// Set of live device allocations currently in use 266 267 #endif // DOXYGEN_SHOULD_SKIP_THIS 268 269 //--------------------------------------------------------------------- 270 // Methods 271 //--------------------------------------------------------------------- 272 273 /** 274 * \brief Constructor. 275 */ CachingDeviceAllocatorcub::CachingDeviceAllocator276 CachingDeviceAllocator( 277 unsigned int bin_growth, ///< Geometric growth factor for bin-sizes 278 unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) 279 unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) 280 size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) 281 bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) 282 bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) 283 : 284 bin_growth(bin_growth), 285 min_bin(min_bin), 286 max_bin(max_bin), 287 min_bin_bytes(IntPow(bin_growth, min_bin)), 288 max_bin_bytes(IntPow(bin_growth, max_bin)), 289 max_cached_bytes(max_cached_bytes), 290 skip_cleanup(skip_cleanup), 291 debug(debug), 292 cached_blocks(BlockDescriptor::SizeCompare), 293 live_blocks(BlockDescriptor::PtrCompare) 294 {} 295 296 297 /** 298 * \brief Default constructor. 299 * 300 * Configured with: 301 * \par 302 * - \p bin_growth = 8 303 * - \p min_bin = 3 304 * - \p max_bin = 7 305 * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes 306 * 307 * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and 308 * sets a maximum of 6,291,455 cached bytes per device 309 */ CachingDeviceAllocatorcub::CachingDeviceAllocator310 CachingDeviceAllocator( 311 bool skip_cleanup = false, 312 bool debug = false) 313 : 314 bin_growth(8), 315 min_bin(3), 316 max_bin(7), 317 min_bin_bytes(IntPow(bin_growth, min_bin)), 318 max_bin_bytes(IntPow(bin_growth, max_bin)), 319 max_cached_bytes((max_bin_bytes * 3) - 1), 320 skip_cleanup(skip_cleanup), 321 debug(debug), 322 cached_blocks(BlockDescriptor::SizeCompare), 323 live_blocks(BlockDescriptor::PtrCompare) 324 {} 325 326 327 /** 328 * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 329 * 330 * Changing the ceiling of cached bytes does not cause any allocations (in-use or 331 * cached-in-reserve) to be freed. See \p FreeAllCached(). 332 */ SetMaxCachedBytescub::CachingDeviceAllocator333 cudaError_t SetMaxCachedBytes( 334 size_t max_cached_bytes) 335 { 336 // Lock 337 mutex.Lock(); 338 339 if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); 340 341 this->max_cached_bytes = max_cached_bytes; 342 343 // Unlock 344 mutex.Unlock(); 345 346 return cudaSuccess; 347 } 348 349 350 /** 351 * \brief Provides a suitable allocation of device memory for the given size on the specified device. 352 * 353 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream 354 * with which it was associated with during allocation, and it becomes available for reuse within other 355 * streams when all prior work submitted to \p active_stream has completed. 356 */ DeviceAllocatecub::CachingDeviceAllocator357 cudaError_t DeviceAllocate( 358 int device, ///< [in] Device on which to place the allocation 359 void **d_ptr, ///< [out] Reference to pointer to the allocation 360 size_t bytes, ///< [in] Minimum number of bytes for the allocation 361 cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation 362 { 363 *d_ptr = NULL; 364 int entrypoint_device = INVALID_DEVICE_ORDINAL; 365 cudaError_t error = cudaSuccess; 366 367 if (device == INVALID_DEVICE_ORDINAL) 368 { 369 if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; 370 device = entrypoint_device; 371 } 372 373 // Create a block descriptor for the requested allocation 374 bool found = false; 375 BlockDescriptor search_key(device); 376 search_key.associated_stream = active_stream; 377 NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); 378 379 if (search_key.bin > max_bin) 380 { 381 // Bin is greater than our maximum bin: allocate the request 382 // exactly and give out-of-bounds bin. It will not be cached 383 // for reuse when returned. 384 search_key.bin = INVALID_BIN; 385 search_key.bytes = bytes; 386 } 387 else 388 { 389 // Search for a suitable cached allocation: lock 390 mutex.Lock(); 391 392 if (search_key.bin < min_bin) 393 { 394 // Bin is less than minimum bin: round up 395 search_key.bin = min_bin; 396 search_key.bytes = min_bin_bytes; 397 } 398 399 // Iterate through the range of cached blocks on the same device in the same bin 400 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); 401 while ((block_itr != cached_blocks.end()) 402 && (block_itr->device == device) 403 && (block_itr->bin == search_key.bin)) 404 { 405 // To prevent races with reusing blocks returned by the host but still 406 // in use by the device, only consider cached blocks that are 407 // either (from the active stream) or (from an idle stream) 408 if ((active_stream == block_itr->associated_stream) || 409 (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) 410 { 411 // Reuse existing cache block. Insert into live blocks. 412 found = true; 413 search_key = *block_itr; 414 search_key.associated_stream = active_stream; 415 live_blocks.insert(search_key); 416 417 // Remove from free blocks 418 cached_bytes[device].free -= search_key.bytes; 419 cached_bytes[device].live += search_key.bytes; 420 421 if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", 422 device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); 423 424 cached_blocks.erase(block_itr); 425 426 break; 427 } 428 block_itr++; 429 } 430 431 // Done searching: unlock 432 mutex.Unlock(); 433 } 434 435 // Allocate the block if necessary 436 if (!found) 437 { 438 // Set runtime's current device to specified device (entrypoint may not be set) 439 if (device != entrypoint_device) 440 { 441 if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; 442 if (CubDebug(error = cudaSetDevice(device))) return error; 443 } 444 445 // Attempt to allocate 446 if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) 447 { 448 // The allocation attempt failed: free all cached blocks on device and retry 449 if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", 450 device, (long long) search_key.bytes, (long long) search_key.associated_stream); 451 452 error = cudaSuccess; // Reset the error we will return 453 cudaGetLastError(); // Reset CUDART's error 454 455 // Lock 456 mutex.Lock(); 457 458 // Iterate the range of free blocks on the same device 459 BlockDescriptor free_key(device); 460 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); 461 462 while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) 463 { 464 // No need to worry about synchronization with the device: cudaFree is 465 // blocking and will synchronize across all kernels executing 466 // on the current device 467 468 // Free device memory and destroy stream event. 469 if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; 470 if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; 471 472 // Reduce balance and erase entry 473 cached_bytes[device].free -= block_itr->bytes; 474 475 if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", 476 device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); 477 478 cached_blocks.erase(block_itr); 479 480 block_itr++; 481 } 482 483 // Unlock 484 mutex.Unlock(); 485 486 // Return under error 487 if (error) return error; 488 489 // Try to allocate again 490 if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; 491 } 492 493 // Create ready event 494 if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) 495 return error; 496 497 // Insert into live blocks 498 mutex.Lock(); 499 live_blocks.insert(search_key); 500 cached_bytes[device].live += search_key.bytes; 501 mutex.Unlock(); 502 503 if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", 504 device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); 505 506 // Attempt to revert back to previous device if necessary 507 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) 508 { 509 if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; 510 } 511 } 512 513 // Copy device pointer to output parameter 514 *d_ptr = search_key.d_ptr; 515 516 if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", 517 (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); 518 519 return error; 520 } 521 522 523 /** 524 * \brief Provides a suitable allocation of device memory for the given size on the current device. 525 * 526 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream 527 * with which it was associated with during allocation, and it becomes available for reuse within other 528 * streams when all prior work submitted to \p active_stream has completed. 529 */ DeviceAllocatecub::CachingDeviceAllocator530 cudaError_t DeviceAllocate( 531 void **d_ptr, ///< [out] Reference to pointer to the allocation 532 size_t bytes, ///< [in] Minimum number of bytes for the allocation 533 cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation 534 { 535 return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); 536 } 537 538 539 /** 540 * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. 541 * 542 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream 543 * with which it was associated with during allocation, and it becomes available for reuse within other 544 * streams when all prior work submitted to \p active_stream has completed. 545 */ DeviceFreecub::CachingDeviceAllocator546 cudaError_t DeviceFree( 547 int device, 548 void* d_ptr) 549 { 550 int entrypoint_device = INVALID_DEVICE_ORDINAL; 551 cudaError_t error = cudaSuccess; 552 553 if (device == INVALID_DEVICE_ORDINAL) 554 { 555 if (CubDebug(error = cudaGetDevice(&entrypoint_device))) 556 return error; 557 device = entrypoint_device; 558 } 559 560 // Lock 561 mutex.Lock(); 562 563 // Find corresponding block descriptor 564 bool recached = false; 565 BlockDescriptor search_key(d_ptr, device); 566 BusyBlocks::iterator block_itr = live_blocks.find(search_key); 567 if (block_itr != live_blocks.end()) 568 { 569 // Remove from live blocks 570 search_key = *block_itr; 571 live_blocks.erase(block_itr); 572 cached_bytes[device].live -= search_key.bytes; 573 574 // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold 575 if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) 576 { 577 // Insert returned allocation into free blocks 578 recached = true; 579 cached_blocks.insert(search_key); 580 cached_bytes[device].free += search_key.bytes; 581 582 if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", 583 device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), 584 (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); 585 } 586 } 587 588 // Unlock 589 mutex.Unlock(); 590 591 // First set to specified device (entrypoint may not be set) 592 if (device != entrypoint_device) 593 { 594 if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; 595 if (CubDebug(error = cudaSetDevice(device))) return error; 596 } 597 598 if (recached) 599 { 600 // Insert the ready event in the associated stream (must have current device set properly) 601 if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; 602 } 603 else 604 { 605 // Free the allocation from the runtime and cleanup the event. 606 if (CubDebug(error = cudaFree(d_ptr))) return error; 607 if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; 608 609 if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", 610 device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); 611 } 612 613 // Reset device 614 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) 615 { 616 if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; 617 } 618 619 return error; 620 } 621 622 623 /** 624 * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. 625 * 626 * Once freed, the allocation becomes available immediately for reuse within the \p active_stream 627 * with which it was associated with during allocation, and it becomes available for reuse within other 628 * streams when all prior work submitted to \p active_stream has completed. 629 */ DeviceFreecub::CachingDeviceAllocator630 cudaError_t DeviceFree( 631 void* d_ptr) 632 { 633 return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); 634 } 635 636 637 /** 638 * \brief Frees all cached device allocations on all devices 639 */ FreeAllCachedcub::CachingDeviceAllocator640 cudaError_t FreeAllCached() 641 { 642 cudaError_t error = cudaSuccess; 643 int entrypoint_device = INVALID_DEVICE_ORDINAL; 644 int current_device = INVALID_DEVICE_ORDINAL; 645 646 mutex.Lock(); 647 648 while (!cached_blocks.empty()) 649 { 650 // Get first block 651 CachedBlocks::iterator begin = cached_blocks.begin(); 652 653 // Get entry-point device ordinal if necessary 654 if (entrypoint_device == INVALID_DEVICE_ORDINAL) 655 { 656 if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; 657 } 658 659 // Set current device ordinal if necessary 660 if (begin->device != current_device) 661 { 662 if (CubDebug(error = cudaSetDevice(begin->device))) break; 663 current_device = begin->device; 664 } 665 666 // Free device memory 667 if (CubDebug(error = cudaFree(begin->d_ptr))) break; 668 if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; 669 670 // Reduce balance and erase entry 671 cached_bytes[current_device].free -= begin->bytes; 672 673 if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", 674 current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); 675 676 cached_blocks.erase(begin); 677 } 678 679 mutex.Unlock(); 680 681 // Attempt to revert back to entry-point device if necessary 682 if (entrypoint_device != INVALID_DEVICE_ORDINAL) 683 { 684 if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; 685 } 686 687 return error; 688 } 689 690 691 /** 692 * \brief Destructor 693 */ ~CachingDeviceAllocatorcub::CachingDeviceAllocator694 virtual ~CachingDeviceAllocator() 695 { 696 if (!skip_cleanup) 697 FreeAllCached(); 698 } 699 700 }; 701 702 703 704 705 /** @} */ // end group UtilMgmt 706 707 } // CUB namespace 708 CUB_NS_POSTFIX // Optional outer namespace(s) 709