/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#pragma once

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include <set>
#include <map>

#include "host/mutex.cuh"
#include <math.h>

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilMgmt
 * @{
 */


/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe and stream-safe and is capable of managing cached
 * device allocations on multiple devices.  It behaves as follows:
 *
 * \par
 * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
 *   the allocation becomes available immediately for reuse within the \p active_stream
 *   with which it was associated during allocation, and it becomes available for
 *   reuse within other streams when all prior work submitted to \p active_stream has completed.
 * - Allocations are categorized and cached by bin size.  A new allocation request of
 *   a given size will only consider cached allocations within the corresponding bin.
 * - Bin limits progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused device allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - %If the total storage of cached allocations on a given device will exceed
 *   \p max_cached_bytes, allocations for that device are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingDeviceAllocator is configured with:
 * - \p bin_growth          = 8
 * - \p min_bin             = 3
 * - \p max_bin             = 7
 * - \p max_cached_bytes    = 6MB - 1B
 *
 * \par
 * which delineates five bin-sizes (512B, 4KB, 32KB, 256KB, and 2MB)
 * and sets a maximum of 6,291,455 cached bytes per device.
 *
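 * \par
 * A minimal usage sketch (the pointer name and request size are illustrative only):
 * \code
 * cub::CachingDeviceAllocator allocator;                       // Default configuration
 * void *d_scratch = NULL;
 * allocator.DeviceAllocate(&d_scratch, 1024 * sizeof(int));    // 4,096 bytes: maps to the 4KB bin
 * // ... launch kernels on the default stream that use d_scratch ...
 * allocator.DeviceFree(d_scratch);                             // Block is recached for reuse, not freed
 * \endcode
 *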
 */
struct CachingDeviceAllocator
{

    //---------------------------------------------------------------------
    // Constants
    //---------------------------------------------------------------------

    /// Out-of-bounds bin
    static const unsigned int INVALID_BIN = (unsigned int) -1;

    /// Invalid size
    static const size_t INVALID_SIZE = (size_t) -1;

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Invalid device ordinal
    static const int INVALID_DEVICE_ORDINAL = -1;

    //---------------------------------------------------------------------
    // Type definitions and helper types
    //---------------------------------------------------------------------

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        void*           d_ptr;              // Device pointer
        size_t          bytes;              // Size of allocation in bytes
        unsigned int    bin;                // Bin enumeration
        int             device;             // Device ordinal
        cudaStream_t    associated_stream;  // Associated stream
        cudaEvent_t     ready_event;        // Signals when the associated stream has run to the point at which this block was freed

        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
        BlockDescriptor(int device) :
            d_ptr(NULL),
            bytes(0),
            bin(INVALID_BIN),
            device(device),
            associated_stream(0),
            ready_event(0)
        {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.d_ptr < b.d_ptr);
            else
                return (a.device < b.device);
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device == b.device)
                return (a.bytes < b.bytes);
            else
                return (a.device < b.device);
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

    /// Aggregate bytes cached (free) and checked-out (live) on a device
    class TotalBytes {
    public:
        size_t free;
        size_t live;
        TotalBytes() { free = live = 0; }
    };

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type from device ordinal to the number of bytes cached on that device
    typedef std::map<int, TotalBytes> GpuCachedBytes;


    //---------------------------------------------------------------------
    // Utility functions
    //---------------------------------------------------------------------

    /**
     * Integer pow function for unsigned base and exponent
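     * (iterative square-and-multiply; e.g., IntPow(8, 3) == 512)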
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;        // multiply the result by the current base
            }
            base = base * base;                // square the base
            exp = exp >> 1;                    // divide the exponent in half
        }
        return retval;
    }


    /**
     * Round up to the nearest power of the given base, returning the exponent and the rounded value
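     * (e.g., with base 8, a request of 3,000 bytes yields power = 4 and rounded_bytes = 4,096)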
     */
    void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        if (value * base < value)
        {
            // Overflow
            power = sizeof(size_t) * 8;
            rounded_bytes = size_t(0) - 1;
            return;
        }

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }


    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    cub::Mutex      mutex;              /// Mutex for thread-safety

    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
    unsigned int    min_bin;            /// Minimum bin enumeration
    unsigned int    max_bin;            /// Maximum bin enumeration

    size_t          min_bin_bytes;      /// Minimum bin size
    size_t          max_bin_bytes;      /// Maximum bin size
    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
    bool            debug;              /// Whether or not to print (de)allocation events to stdout

    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingDeviceAllocator(
        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no output)
    :
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}


    /**
     * \brief Default constructor.
     *
     * Configured with:
     * \par
     * - \p bin_growth          = 8
     * - \p min_bin             = 3
     * - \p max_bin             = 7
     * - \p max_cached_bytes    = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
     *
     * which delineates five bin-sizes (512B, 4KB, 32KB, 256KB, and 2MB) and
     * sets a maximum of 6,291,455 cached bytes per device.
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false,
        bool debug = false)
    :
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1),
        skip_cleanup(skip_cleanup),
        debug(debug),
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare)
    {}


    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
     *
     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
     * cached-in-reserve) to be freed.  See \p FreeAllCached().
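     *
     * \par
     * A sketch (the 1MB cap is illustrative):
     * \code
     * allocator.SetMaxCachedBytes(1024 * 1024);    // Subsequent frees recache at most 1MB per device
     * \endcode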
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
        // Lock
        mutex.Lock();

        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);

        this->max_cached_bytes = max_cached_bytes;

        // Unlock
        mutex.Unlock();

        return cudaSuccess;
    }


    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
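     *
     * \par
     * A usage sketch (the device ordinal, request size, and stream handle \p stream1 are illustrative):
     * \code
     * void *d_buf = NULL;
     * allocator.DeviceAllocate(1, &d_buf, 1024 * 1024, stream1);   // Rounds up to the 2MB bin on device 1,
     *                                                              // associated with stream1
     * \endcode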
     */
    cudaError_t DeviceAllocate(
        int             device,             ///< [in] Device on which to place the allocation
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        *d_ptr                          = NULL;
        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
        cudaError_t error               = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            device = entrypoint_device;
        }

        // Create a block descriptor for the requested allocation
        bool found = false;
        BlockDescriptor search_key(device);
        search_key.associated_stream = active_stream;
        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);

        if (search_key.bin > max_bin)
        {
            // Bin is greater than our maximum bin: allocate the request
            // exactly and give out-of-bounds bin.  It will not be cached
            // for reuse when returned.
            search_key.bin      = INVALID_BIN;
            search_key.bytes    = bytes;
        }
        else
        {
            // Search for a suitable cached allocation: lock
            mutex.Lock();

            if (search_key.bin < min_bin)
            {
                // Bin is less than minimum bin: round up
                search_key.bin      = min_bin;
                search_key.bytes    = min_bin_bytes;
            }

            // Iterate through the range of cached blocks on the same device in the same bin
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            while ((block_itr != cached_blocks.end())
                    && (block_itr->device == device)
                    && (block_itr->bin == search_key.bin))
            {
                // To prevent races with reusing blocks returned by the host but still
                // in use by the device, only consider cached blocks that are
                // either (from the active stream) or (from an idle stream)
                if ((active_stream == block_itr->associated_stream) ||
                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
                {
                    // Reuse existing cache block.  Insert into live blocks.
                    found = true;
                    search_key = *block_itr;
                    search_key.associated_stream = active_stream;
                    live_blocks.insert(search_key);

                    // Remove from free blocks
                    cached_bytes[device].free -= search_key.bytes;
                    cached_bytes[device].live += search_key.bytes;

                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream);

                    cached_blocks.erase(block_itr);

                    break;
                }
                block_itr++;
            }

            // Done searching: unlock
            mutex.Unlock();
        }

        // Allocate the block if necessary
        if (!found)
        {
            // Set runtime's current device to specified device (entrypoint may not be set)
            if (device != entrypoint_device)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
                if (CubDebug(error = cudaSetDevice(device))) return error;
            }

            // Attempt to allocate
            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
            {
                // The allocation attempt failed: free all cached blocks on device and retry
                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations\n",
                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);

                error = cudaSuccess;    // Reset the error we will return
                cudaGetLastError();     // Reset CUDART's error

                // Lock
                mutex.Lock();

                // Iterate the range of free blocks on the same device
                BlockDescriptor free_key(device);
                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);

                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
                {
                    // No need to worry about synchronization with the device: cudaFree is
                    // blocking and will synchronize across all kernels executing
                    // on the current device

                    // Free device memory and destroy stream event.
                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;

                    // Reduce balance
                    cached_bytes[device].free -= block_itr->bytes;

                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

                    // Erase entry (advance the iterator before erasing it;
                    // incrementing an erased iterator is undefined behavior)
                    cached_blocks.erase(block_itr++);
                }

                // Unlock
                mutex.Unlock();

                // Return under error
                if (error) return error;

                // Try to allocate again
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
            }

            // Create ready event
            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
                return error;

            // Insert into live blocks
            mutex.Lock();
            live_blocks.insert(search_key);
            cached_bytes[device].live += search_key.bytes;
            mutex.Unlock();

            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

            // Attempt to revert back to previous device if necessary
            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
            {
                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
            }
        }

        // Copy device pointer to output parameter
        *d_ptr = search_key.d_ptr;

        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);

        return error;
    }


    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceAllocate(
        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
    {
        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
    }


    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
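     *
     * \par
     * A usage sketch (assumes \p d_buf was obtained from this allocator on device 1):
     * \code
     * allocator.DeviceFree(1, d_buf);  // Recached for reuse (or freed if over max_cached_bytes)
     * \endcode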
     */
    cudaError_t DeviceFree(
        int             device,
        void*           d_ptr)
    {
        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
        cudaError_t error               = cudaSuccess;

        if (device == INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
                return error;
            device = entrypoint_device;
        }

        // Lock
        mutex.Lock();

        // Find corresponding block descriptor
        bool recached = false;
        BlockDescriptor search_key(d_ptr, device);
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr != live_blocks.end())
        {
            // Remove from live blocks
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            cached_bytes[device].live -= search_key.bytes;

            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
            {
                // Insert returned allocation into free blocks
                recached = true;
                cached_blocks.insert(search_key);
                cached_bytes[device].free += search_key.bytes;

                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding (%lld bytes).\n",
                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
            }
        }

        // Unlock
        mutex.Unlock();

        // First set to specified device (entrypoint may not be set)
        if (device != entrypoint_device)
        {
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
            if (CubDebug(error = cudaSetDevice(device))) return error;
        }

        if (recached)
        {
            // Insert the ready event in the associated stream (must have current device set properly)
            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
        }
        else
        {
            // Free the allocation from the runtime and cleanup the event.
            if (CubDebug(error = cudaFree(d_ptr))) return error;
            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
        }

        // Reset device
        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }


    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
     *
     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
     * with which it was associated during allocation, and it becomes available for reuse within other
     * streams when all prior work submitted to \p active_stream has completed.
     */
    cudaError_t DeviceFree(
        void*           d_ptr)
    {
        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
    }


    /**
     * \brief Frees all cached device allocations on all devices
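     *
     * \par
     * Sketch of a typical explicit teardown (live blocks are untouched):
     * \code
     * allocator.FreeAllCached();   // cudaFree every cached (not live) block on every device
     * \endcode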
     */
    cudaError_t FreeAllCached()
    {
        cudaError_t error         = cudaSuccess;
        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
        int current_device        = INVALID_DEVICE_ORDINAL;

        mutex.Lock();

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;

            // Reduce balance and erase entry
            cached_bytes[current_device].free -= begin->bytes;

            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);

            cached_blocks.erase(begin);
        }

        mutex.Unlock();

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;
    }


    /**
     * \brief Destructor
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};




/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)