1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 //
25 // High level description of PMM is in the header file, here some implementation
26 // details are discussed.
27 //
28 // There is one PMM object per GPU and the PMM state among GPUs is completely
29 // separate with the exception of a few shared kmem caches.
30 //
31 // PMM allocates all of the memory it manages from PMA which is the common GPU
32 // Physical Memory Allocator shared by UVM and RM (it's included as part of RM,
33 // but logically separate from it).
34 //
35 // The state of each GPU memory chunk is tracked in uvm_gpu_chunk_t objects.
36 // Each chunk has a type, size and state. Type and size are persistent
37 // throughout chunk's lifetime while its state changes as it's allocated, split,
38 // merged and freed.
39 //
40 // PMM maintains a pre-allocated flat array of root chunks covering all possible
41 // physical allocations that can be returned from PMA. For simplicity, PMM
42 // always allocates 2M (UVM_CHUNK_SIZE_MAX) chunks from PMA and each naturally
43 // aligned 2M chunk represents a single root chunk. The root chunks array is
44 // indexed by the physical address of each chunk divided by UVM_CHUNK_SIZE_MAX
45 // allowing for a simple and fast lookup of root chunks.
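// As a minimal illustrative sketch (mirroring root_chunk_from_address() and
// root_chunk_index() below), the lookup is just an array index computed from
// the physical address:
//
//     size_t index = addr / UVM_CHUNK_SIZE_MAX;
//     uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[index];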
46 //
47 // Each root chunk has a tracker for any pending operations on the root chunk
48 // (including all of its subchunks in case it's split) to support asynchronous
49 // alloc and free. Each tracker is protected by a separate bitlock (see
50 // root_chunk_lock()) as synchronizing any pending operations might take a long
51 // time and it would be undesirable for that to block other operations of PMM.
52 // Notably some synchronization is required as part of allocation to handle GPU
53 // lifetime issues across VA spaces (see comments in uvm_pmm_gpu_alloc()). Bit
54 // locks (instead of a mutex in each root chunk) are used to save space.
55 //
56 // All free chunks (UVM_PMM_GPU_CHUNK_STATE_FREE) are kept on free lists, with
// one list for each combination of memory type and chunk size (see usage of
58 // uvm_pmm_gpu_t::free_list for reference). This allows for a very quick
59 // allocation and freeing of chunks in case the right size is already available
60 // on alloc or no merges are required on free. See claim_free_chunk() for
61 // allocation and chunk_free_locked() for freeing.
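//
// Conceptually, the fast allocation path is a simplified version of what
// claim_free_chunk() does (list removal, zeroing policy and error handling
// omitted; zero_type is whichever zeroed/non-zeroed list is preferred):
//
//     struct list_head *list = find_free_list(pmm, type, chunk_size, zero_type);
//     uvm_gpu_chunk_t *chunk = list_first_chunk(list);
//     if (chunk)
//         chunk_pin(pmm, chunk); // Hand the chunk out as TEMP_PINNED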
62 //
63 // When a chunk is allocated it transitions into the temporarily pinned state
64 // (UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) until it's unpinned when it becomes
65 // allocated (UVM_PMM_GPU_CHUNK_STATE_ALLOCATED). This transition is only
66 // meaningful for user memory chunks where temporarily pinned chunks cannot be
67 // evicted. Kernel memory type chunks do not support eviction at all and they
68 // are transitioned into the allocated state as part of the allocation itself
69 // (see uvm_pmm_gpu_alloc_kernel). When the chunk is freed it transitions back
70 // to the free state and is placed on an appropriate free list.
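//
// A typical user chunk lifecycle, sketched with the APIs in this file (error
// handling omitted; the NONE allocation flag is assumed to be available):
//
//     uvm_gpu_chunk_t *chunk;
//     status = uvm_pmm_gpu_alloc(pmm, 1, UVM_CHUNK_SIZE_MAX, UVM_PMM_GPU_MEMORY_TYPE_USER,
//                                UVM_PMM_ALLOC_FLAGS_NONE, &chunk, &tracker);
//     // chunk is TEMP_PINNED here and cannot be evicted
//     uvm_pmm_gpu_unpin_allocated(pmm, chunk, va_block);  // -> ALLOCATED, evictable
//     ...
//     uvm_pmm_gpu_free(pmm, chunk, &tracker);             // -> FREE (possibly merged)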
71 //
72 // To support smaller allocations, PMM internally splits and merges root chunks
73 // as needed. Splitting and merging is protected by an exclusive lock
74 // (uvm_pmm_gpu_t::lock) to prevent PMM from over-allocating root chunks in case
75 // multiple threads race for a small allocation and there are no free chunks
76 // immediately available.
77 //
78 // Splitting is performed lazily, i.e. chunks are only split when a chunk of the
79 // requested type and size is not available. Splits are only done to the next
80 // smaller size and hence may need to be performed multiple times recursively to
81 // get to the desired chunk size. See alloc_chunk_with_splits(). All split
82 // chunks under the root chunk form a tree with all internal nodes being in
83 // split state and leaf nodes being in any of the free, allocated or pinned
84 // states.
85 //
86 // Merging is performed eagerly, i.e. whenever all chunks under a parent (split)
87 // chunk become free, they are merged into one bigger chunk. See
88 // free_chunk_with_merges().
89 //
90 // Splitting and merging already allocated chunks is also exposed to the users of
91 // allocated chunks. See uvm_pmm_gpu_split_chunk() and uvm_pmm_gpu_merge_chunk().
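//
// For example, splitting an allocated 2M chunk into 64K leaves and later
// restoring it (a sketch; assumes the 64K chunk size is enabled for the type):
//
//     uvm_gpu_chunk_t *subchunks[UVM_CHUNK_SIZE_MAX / UVM_CHUNK_SIZE_64K];
//     status = uvm_pmm_gpu_split_chunk(pmm, chunk, UVM_CHUNK_SIZE_64K, subchunks);
//     ...
//     uvm_pmm_gpu_merge_chunk(pmm, chunk);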
92 //
93 // As splits and merges are protected by a single PMM mutex, they are only
94 // performed when really necessary. See alloc_chunk() that falls back to split
95 // only as the last step and free_chunk() that similarly first tries performing
96 // a quick free.
97 //
98 // When a memory allocation from PMA fails and eviction is requested, PMM will
99 // check whether it can evict any user memory chunks to satisfy the request.
100 // All allocated user memory root chunks are tracked in an LRU list
101 // (root_chunks.va_block_used). A root chunk is moved to the tail of that list
102 // whenever any of its subchunks is allocated (unpinned) by a VA block (see
103 // uvm_pmm_gpu_unpin_allocated()). When a root chunk is selected for eviction,
104 // it has the eviction flag set (see pick_root_chunk_to_evict()). This flag
105 // affects many of the PMM operations on all of the subchunks of the root chunk
106 // being evicted. See usage of (root_)chunk_is_in_eviction(), in particular in
107 // chunk_free_locked() and claim_free_chunk().
108 //
109 // To evict a root chunk, all of its free subchunks are pinned, then all
110 // resident pages backed by it are moved to the CPU one VA block at a time.
111 // After all of them are moved, the root chunk is merged and returned to the
112 // caller. See evict_root_chunk() for details.
113 //
// Eviction can also be triggered by PMA. This makes it possible for
// other PMA clients (most importantly RM, which CUDA uses for non-UVM
116 // allocations) to successfully allocate memory from the user memory pool
117 // allocated by UVM. UVM registers two eviction callbacks with PMA that PMA
118 // calls as needed to perform the eviction:
119 //  - uvm_pmm_gpu_pma_evict_range - for evicting a physical range
120 //  - uvm_pmm_gpu_pma_evict_pages - for evicting a number of pages
121 //
122 // Both of them perform the eviction using the same building blocks as internal
123 // eviction, but see their implementation and references to pma.h for more
124 // details.
125 //
126 // PMM locking
127 // - PMM mutex
128 //   Exclusive lock protecting both internal and external splits and merges, and
129 //   eviction.
130 //
131 // - PMM list lock
132 //   Protects state transitions of chunks and their movement among lists.
133 //
134 // - PMM root chunk bit locks
135 //   Each bit lock protects the corresponding root chunk's allocation, freeing
136 //   from/to PMA, root chunk trackers, and root chunk indirect_peer mappings.
137 //
138 // - PMA allocation/eviction lock
139 //   A read-write semaphore used by the eviction path to flush any pending
140 //   allocations. See usage of pma_lock in alloc_root_chunk() and
141 //   uvm_pmm_gpu_pma_evict_range().
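//
// A typical nesting of these locks as seen in this file (an illustrative
// sketch, not an exhaustive ordering):
//
//     uvm_mutex_lock(&pmm->lock);              // PMM mutex: split/merge/eviction
//         uvm_spin_lock(&pmm->list_lock);      // List lock: chunk state transitions
//         uvm_spin_unlock(&pmm->list_lock);
//     uvm_mutex_unlock(&pmm->lock);
//
//     root_chunk_lock(pmm, root_chunk);        // Bit lock serializing the root chunk
//     status = uvm_tracker_wait(&root_chunk->tracker);
//     root_chunk_unlock(pmm, root_chunk);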
142 //
// === Trade-offs ===
144 //
145 // In general, PMM is optimized towards Pascal+ and 2M VA blocks (that's also
146 // the UVM_CHUNK_SIZE_MAX) as Pascal+ makes much heavier use of PMM:
147 //  - Oversubscription is Pascal+ only
148 //  - On pre-Pascal (UVM-Lite) CUDA currently pre-populates all managed memory
149 //    and hence performance matters mostly only during CUDA memory allocation.
150 //  - On Pascal+ CUDA doesn't pre-populate and memory is allocated on first
151 //    touch.
152 //
153 // The root chunk size matching the VA block chunk size allows PMM to avoid
154 // having to split and merge for the hopefully (HMM might make this hard) common
155 // allocation size of 2M on Pascal+.
156 //
157 // Careful benchmarks and tweaking of PMM are yet to be performed, but there is
158 // some evidence for PMA to potentially cause issues for oversubscription (see
159 // bug 1775408).
160 //
161 
162 #include "uvm_common.h"
163 #include "nv_uvm_interface.h"
164 #include "uvm_api.h"
165 #include "uvm_gpu.h"
166 #include "uvm_pmm_gpu.h"
167 #include "uvm_mem.h"
168 #include "uvm_mmu.h"
169 #include "uvm_global.h"
170 #include "uvm_kvmalloc.h"
171 #include "uvm_va_space.h"
172 #include "uvm_va_block.h"
173 #include "uvm_test.h"
174 #include "uvm_linux.h"
175 
176 static int uvm_global_oversubscription = 1;
177 module_param(uvm_global_oversubscription, int, S_IRUGO);
178 MODULE_PARM_DESC(uvm_global_oversubscription, "Enable (1) or disable (0) global oversubscription support.");
179 
180 #define UVM_PERF_PMA_BATCH_NONPINNED_ORDER_DEFAULT 6
181 
182 // Non-pinned root chunks are allocated in batches, in order to minimize the
183 // number of calls into PMA. The number of root chunks in the batch is:
184 // (1 << uvm_perf_pma_batch_nonpinned_order)
185 static unsigned uvm_perf_pma_batch_nonpinned_order = UVM_PERF_PMA_BATCH_NONPINNED_ORDER_DEFAULT;
186 module_param(uvm_perf_pma_batch_nonpinned_order, uint, S_IRUGO);
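
// With the default order of 6, a batch is 1 << 6 = 64 root chunks, i.e.
// 64 * 2M = 128MB requested from PMA at a time.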
187 
188 // Helper type for refcounting cache
189 typedef struct
190 {
191     // Cache for given split size
192     struct kmem_cache *cache;
193 
194     // Number of GPUs using given split size
195     NvU32 refcount;
196 
197     // Name of cache
198     char name[32];
199 } kmem_cache_ref_t;
200 
201 static kmem_cache_ref_t g_pma_address_batch_cache_ref;
202 
203 struct uvm_pmm_gpu_chunk_suballoc_struct
204 {
205     // Number of allocated chunks (including pinned ones)
206     NvU32 allocated;
207 
208     // Number of pinned leaf chunks under this chunk
209     //
210     // Tracked only for suballocs of root chunks to know whether a root chunk
211     // can be evicted. This is not in the uvm_gpu_root_chunk_t itself to stop
212     // the root chunks array from growing too much.
213     // TODO: Bug 1765193: Consider moving this to a union with the parent
214     // pointer in uvm_gpu_chunk_t as root chunks never have a parent or just put
215     // in the root chunk directly.
216     // TODO: Bug 1765193: This could be NvU16 if we enforce the smallest chunk
217     // size to be at least 2^21 / 2^16 = 32 bytes.
218     NvU32 pinned_leaf_chunks;
219 
220     // Array of all child subchunks
221     // TODO: Bug 1765461: Can the array be inlined? It could save the parent
222     //       pointer.
223     uvm_gpu_chunk_t *subchunks[];
224 };
225 
226 typedef enum
227 {
228     CHUNK_WALK_PRE_ORDER,
229     CHUNK_WALK_POST_ORDER
230 } chunk_walk_order_t;
231 
232 typedef NV_STATUS (*chunk_walk_func_t)(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data);
233 
234 // Cache for allocation of uvm_pmm_gpu_chunk_suballoc_t. At index n it stores
235 // a suballoc structure for size 2**n.
236 //
// For convenience of the init/deinit code, level 0 is used for allocation of chunks.
238 static kmem_cache_ref_t chunk_split_cache[UVM_PMM_CHUNK_SPLIT_CACHE_SIZES];
239 #define CHUNK_CACHE chunk_split_cache[0].cache
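
// For example, splitting a 2M chunk into 64K children produces 32 subchunks, so
// the suballoc for that split comes from chunk_split_cache[ilog2(32)] == index 5
// (a worked example, assuming the 64K chunk size is in use).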
240 
const char *uvm_pmm_gpu_memory_type_string(uvm_pmm_gpu_memory_type_t type)
242 {
243     switch (type) {
244         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_USER);
245         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED);
246         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_KERNEL);
247         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED);
248         UVM_ENUM_STRING_DEFAULT();
249     }
250 
251     BUILD_BUG_ON(UVM_PMM_GPU_MEMORY_TYPE_COUNT != 4);
252 }
253 
const char *uvm_pmm_gpu_chunk_state_string(uvm_pmm_gpu_chunk_state_t state)
255 {
256     switch (state) {
257         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
258         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_FREE);
259         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
260         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
261         UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
262         UVM_ENUM_STRING_DEFAULT();
263     }
264 }
265 
266 // The PMA APIs that can be called from PMA eviction callbacks (pmaPinPages and
// pmaFreePages*) need to be called differently depending on whether the call is part
268 // of PMA eviction or not. The PMM context is used to plumb that information
269 // through the stack in a couple of places.
270 typedef enum
271 {
272     PMM_CONTEXT_DEFAULT,
273     PMM_CONTEXT_PMA_EVICTION,
274 } uvm_pmm_context_t;
275 
// Freeing the root chunk not only needs to differentiate between two different
// contexts for calling pmaFreePages(), but in some cases also needs to skip the
// free back to PMA altogether.
279 typedef enum
280 {
281     FREE_ROOT_CHUNK_MODE_DEFAULT,
282     FREE_ROOT_CHUNK_MODE_PMA_EVICTION,
283     FREE_ROOT_CHUNK_MODE_SKIP_PMA_FREE
284 } free_root_chunk_mode_t;
285 
static free_root_chunk_mode_t free_root_chunk_mode_from_pmm_context(uvm_pmm_context_t pmm_context)
287 {
288     switch (pmm_context) {
289         case PMM_CONTEXT_DEFAULT:
290             return FREE_ROOT_CHUNK_MODE_DEFAULT;
291         case PMM_CONTEXT_PMA_EVICTION:
292             return FREE_ROOT_CHUNK_MODE_PMA_EVICTION;
293         default:
294             UVM_ASSERT_MSG(false, "Invalid PMM context: 0x%x\n", pmm_context);
295             return FREE_ROOT_CHUNK_MODE_DEFAULT;
296     }
297 }
298 
299 static NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
300                              uvm_pmm_gpu_memory_type_t type,
301                              uvm_chunk_size_t chunk_size,
302                              uvm_pmm_alloc_flags_t flags,
303                              uvm_gpu_chunk_t **chunk);
304 static NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
305                                   uvm_pmm_gpu_memory_type_t type,
306                                   uvm_pmm_alloc_flags_t flags,
307                                   uvm_gpu_chunk_t **chunk);
308 static void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_root_chunk_mode_t free_mode);
309 static NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
310 static void free_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
311 static void free_chunk_with_merges(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
312 static bool free_next_available_root_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_type_t type);
313 static struct list_head *find_free_list(uvm_pmm_gpu_t *pmm,
314                                         uvm_pmm_gpu_memory_type_t type,
315                                         uvm_chunk_size_t chunk_size,
316                                         uvm_pmm_list_zero_t zero_type);
317 static bool check_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
318 static struct list_head *find_free_list_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
319 static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
320 
static size_t root_chunk_index(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
322 {
323     size_t index = root_chunk->chunk.address / UVM_CHUNK_SIZE_MAX;
324     UVM_ASSERT(index < pmm->root_chunks.count);
325     return index;
326 }
327 
static void root_chunk_lock(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
329 {
330     uvm_bit_lock(&pmm->root_chunks.bitlocks, root_chunk_index(pmm, root_chunk));
331 }
332 
static void uvm_assert_root_chunk_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
334 {
335     uvm_assert_bit_locked(&pmm->root_chunks.bitlocks, root_chunk_index(pmm, root_chunk));
336 }
337 
static void root_chunk_unlock(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
339 {
340     uvm_bit_unlock(&pmm->root_chunks.bitlocks, root_chunk_index(pmm, root_chunk));
341 }
342 
343 // TODO: Bug 1795559: Remove once PMA eviction is considered safe enough not to
344 // have an opt-out.
static bool gpu_supports_pma_eviction(uvm_gpu_t *gpu)
346 {
347     return uvm_global_oversubscription && uvm_parent_gpu_supports_eviction(gpu->parent);
348 }
349 
uvm_gpu_t *uvm_pmm_to_gpu(uvm_pmm_gpu_t *pmm)
351 {
352     return container_of(pmm, uvm_gpu_t, pmm);
353 }
354 
static uvm_gpu_root_chunk_t *root_chunk_from_address(uvm_pmm_gpu_t *pmm, NvU64 addr)
356 {
357     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
358     size_t index = addr / UVM_CHUNK_SIZE_MAX;
359     uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[index];
360 
361     UVM_ASSERT_MSG(addr <= gpu->mem_info.max_allocatable_address,
362                    "Address 0x%llx vidmem max phys 0x%llx GPU %s\n",
363                    addr,
364                    gpu->mem_info.max_allocatable_address,
365                    uvm_gpu_name(gpu));
366     UVM_ASSERT(root_chunk->chunk.address == UVM_ALIGN_DOWN(addr, UVM_CHUNK_SIZE_MAX));
367 
368     return root_chunk;
369 }
370 
static uvm_gpu_root_chunk_t *root_chunk_from_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
372 {
373     return root_chunk_from_address(pmm, chunk->address);
374 }
375 
static bool chunk_is_root_chunk(uvm_gpu_chunk_t *chunk)
377 {
378     return uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX;
379 }
380 
static bool chunk_is_root_chunk_pinned(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
382 {
383     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
384 
385     uvm_assert_spinlock_locked(&pmm->list_lock);
386 
387     chunk = &root_chunk->chunk;
388 
389     if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED)
390         return true;
391     else if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
392         return false;
393 
394     UVM_ASSERT(chunk->suballoc);
395 
396     return chunk->suballoc->pinned_leaf_chunks > 0;
397 }
398 
399 // Pin a chunk and update its root chunk's pinned leaf chunks count if the
400 // chunk is not a root chunk.
static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
402 {
403     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
404 
405     uvm_assert_spinlock_locked(&pmm->list_lock);
406     UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
407     chunk->state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
408 
409     if (chunk_is_root_chunk(chunk))
410         return;
411 
412     // For subchunks, update the pinned leaf chunks count tracked in the
413     // suballoc of the root chunk.
414     chunk = &root_chunk->chunk;
415 
416     // The passed-in subchunk is not the root chunk so the root chunk has to be
417     // split.
418     UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n",
419             uvm_pmm_gpu_chunk_state_string(chunk->state));
420 
421     chunk->suballoc->pinned_leaf_chunks++;
422 }
423 
424 // Unpin a chunk and update its root chunk's pinned leaf chunks count if the
425 // chunk is not a root chunk.
static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_chunk_state_t new_state)
427 {
428     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
429 
430     uvm_assert_spinlock_locked(&pmm->list_lock);
431     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
432     UVM_ASSERT(chunk->va_block == NULL);
433     UVM_ASSERT(chunk_is_root_chunk_pinned(pmm, chunk));
434     UVM_ASSERT(new_state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
435 
436     chunk->state = new_state;
437 
438     if (chunk_is_root_chunk(chunk))
439         return;
440 
441     // For subchunks, update the pinned leaf chunks count tracked in the
442     // suballoc of the root chunk.
443     chunk = &root_chunk->chunk;
444 
445     // The passed-in subchunk is not the root chunk so the root chunk has to be
446     // split.
447     UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n",
448             uvm_pmm_gpu_chunk_state_string(chunk->state));
449 
450     UVM_ASSERT(chunk->suballoc->pinned_leaf_chunks != 0);
451     chunk->suballoc->pinned_leaf_chunks--;
452 }
453 
bool uvm_pmm_gpu_memory_type_is_user(uvm_pmm_gpu_memory_type_t type)
455 {
456     UVM_ASSERT(type < UVM_PMM_GPU_MEMORY_TYPE_COUNT);
457 
458     switch (type) {
459         case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
460         case UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED:
461             return true;
462         default:
463             return false;
464     }
465 }
466 
static bool memory_type_is_protected(uvm_pmm_gpu_memory_type_t type)
468 {
469     switch (type) {
470         case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
471         case UVM_PMM_GPU_MEMORY_TYPE_KERNEL: // Alias UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED:
472             return true;
473         default:
474             return false;
475     }
476 }
477 
static void uvm_gpu_chunk_set_in_eviction(uvm_gpu_chunk_t *chunk, bool in_eviction)
479 {
480     UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
481     UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
482     chunk->in_eviction = in_eviction;
483 }
484 
// A helper that queries the eviction flag of the root chunk of the given chunk.
486 // Eviction is only tracked for root chunks.
static bool chunk_is_in_eviction(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
488 {
489     return root_chunk_from_chunk(pmm, chunk)->chunk.in_eviction;
490 }
491 
uvm_gpu_t *uvm_gpu_chunk_get_gpu(const uvm_gpu_chunk_t *chunk)
493 {
494     uvm_gpu_t *gpu = uvm_gpu_get(uvm_gpu_id_from_index(chunk->gpu_index));
495     UVM_ASSERT(gpu);
496 
497     return gpu;
498 }
499 
struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
501 {
502     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
503     NvU64 sys_addr = chunk->address + gpu->parent->system_bus.memory_window_start;
504     unsigned long pfn = sys_addr >> PAGE_SHIFT;
505 
506     UVM_ASSERT(sys_addr + uvm_gpu_chunk_get_size(chunk) <= gpu->parent->system_bus.memory_window_end + 1);
507     UVM_ASSERT(gpu->mem_info.numa.enabled);
508 
509     return pfn_to_page(pfn);
510 }
511 
void uvm_pmm_gpu_sync(uvm_pmm_gpu_t *pmm)
513 {
514     size_t i;
515 
516     if (!pmm->initialized)
517         return;
518 
519     // Just go over all root chunks and sync the ones that are not PMA OWNED.
520     // This is slow, but uvm_pmm_gpu_sync() is a rarely used operation not
521     // critical for performance.
522     for (i = 0; i < pmm->root_chunks.count; ++i) {
523         uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
524 
525         root_chunk_lock(pmm, root_chunk);
526         if (root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
527             NV_STATUS status = uvm_tracker_wait(&root_chunk->tracker);
528             if (status != NV_OK)
529                 UVM_ASSERT(status == uvm_global_get_status());
530         }
531         root_chunk_unlock(pmm, root_chunk);
532     }
533 }
534 
static uvm_pmm_gpu_memory_type_t pmm_squash_memory_type(uvm_pmm_gpu_memory_type_t type)
536 {
537     if (g_uvm_global.conf_computing_enabled)
538         return type;
539 
540     // Enforce the contract that when the Confidential Computing feature is
541     // disabled, all user types are alike, as well as all kernel types,
542     // respectively. See uvm_pmm_gpu_memory_type_t.
543     if (uvm_pmm_gpu_memory_type_is_user(type))
544         return UVM_PMM_GPU_MEMORY_TYPE_USER;
545 
546     return UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
547 }
548 
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
550                             size_t num_chunks,
551                             uvm_chunk_size_t chunk_size,
552                             uvm_pmm_gpu_memory_type_t mem_type,
553                             uvm_pmm_alloc_flags_t flags,
554                             uvm_gpu_chunk_t **chunks,
555                             uvm_tracker_t *out_tracker)
556 {
557     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
558     NV_STATUS status;
559     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
560     size_t i;
561 
562     UVM_ASSERT((unsigned)mem_type < UVM_PMM_GPU_MEMORY_TYPE_COUNT);
563     UVM_ASSERT_MSG(is_power_of_2(chunk_size), "chunk size %u\n", chunk_size);
564     UVM_ASSERT_MSG(chunk_size & pmm->chunk_sizes[mem_type], "chunk size %u\n", chunk_size);
565     UVM_ASSERT(num_chunks == 0 || chunks);
566     UVM_ASSERT((flags & UVM_PMM_ALLOC_FLAGS_MASK) == flags);
567 
568     if (flags & UVM_PMM_ALLOC_FLAGS_EVICT) {
569         // If eviction is requested then VA block locks need to be lockable
570         uvm_assert_lockable_order(UVM_LOCK_ORDER_VA_BLOCK);
571     }
572 
573     mem_type = pmm_squash_memory_type(mem_type);
574     for (i = 0; i < num_chunks; i++) {
575         uvm_gpu_root_chunk_t *root_chunk;
576 
577         status = alloc_chunk(pmm, mem_type, chunk_size, flags, &chunks[i]);
578         if (status != NV_OK)
579             goto error;
580 
581         root_chunk = root_chunk_from_chunk(pmm, chunks[i]);
582 
583         root_chunk_lock(pmm, root_chunk);
584         uvm_tracker_remove_completed(&root_chunk->tracker);
585         status = uvm_tracker_add_tracker_safe(&local_tracker, &root_chunk->tracker);
586         root_chunk_unlock(pmm, root_chunk);
587 
588         if (status != NV_OK) {
589             i++;
590             goto error;
591         }
592     }
593 
594     // Before we return to the caller, we need to ensure that the tracker only
595     // contains tracker entries belonging to the PMM's GPU. Otherwise we
596     // could leak trackers for other GPUs into VA spaces which never
597     // registered those GPUs, causing lifetime problems when those GPUs go
598     // away.
599     status = uvm_tracker_wait_for_other_gpus(&local_tracker, gpu);
600     if (status != NV_OK)
601         goto error;
602 
603     if (out_tracker) {
604         status = uvm_tracker_add_tracker_safe(out_tracker, &local_tracker);
605         uvm_tracker_clear(&local_tracker);
606         if (status != NV_OK)
607             goto error;
608     }
609 
610     return uvm_tracker_wait_deinit(&local_tracker);
611 
612 error:
613     uvm_tracker_deinit(&local_tracker);
614     while (i-- > 0)
615         free_chunk(pmm, chunks[i]);
616 
617     // Reset the array to make error handling easier for callers.
618     memset(chunks, 0, sizeof(chunks[0]) * num_chunks);
619 
620     return status;
621 }
622 
static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
624                                       size_t num_chunks,
625                                       uvm_chunk_size_t chunk_size,
626                                       uvm_pmm_gpu_memory_type_t memory_type,
627                                       uvm_pmm_alloc_flags_t flags,
628                                       uvm_gpu_chunk_t **chunks,
629                                       uvm_tracker_t *out_tracker)
630 {
631     size_t i;
632     NV_STATUS status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
633     if (status != NV_OK)
634         return status;
635 
636     for (i = 0; i < num_chunks; ++i) {
637         UVM_ASSERT(chunks[i]->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
638 
639         uvm_spin_lock(&pmm->list_lock);
640         chunk_unpin(pmm, chunks[i], UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
641         chunks[i]->is_referenced = false;
642         uvm_spin_unlock(&pmm->list_lock);
643     }
644 
645     return NV_OK;
646 }
647 
static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
649 {
650     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
651 
652     uvm_assert_spinlock_locked(&pmm->list_lock);
653 
654     if (uvm_pmm_gpu_memory_type_is_user(chunk->type)) {
655         if (chunk_is_root_chunk_pinned(pmm, chunk)) {
656             UVM_ASSERT(root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
657                        root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
658             list_del_init(&root_chunk->chunk.list);
659         }
660         else if (root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_FREE) {
661             UVM_ASSERT(root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
662                        root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
663             list_move_tail(&root_chunk->chunk.list, &pmm->root_chunks.va_block_used);
664         }
665     }
666 
667     // TODO: Bug 1757148: Improve fragmentation of split chunks
668     if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE)
669         list_move_tail(&chunk->list, find_free_list_chunk(pmm, chunk));
670     else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED)
671         list_del_init(&chunk->list);
672 }
673 
static void gpu_unpin_temp(uvm_pmm_gpu_t *pmm,
675                            uvm_gpu_chunk_t *chunk,
676                            uvm_va_block_t *va_block,
677                            bool is_referenced)
678 {
679     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
680     UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
681 
682     INIT_LIST_HEAD(&chunk->list);
683 
684     uvm_spin_lock(&pmm->list_lock);
685 
686     UVM_ASSERT(!chunk->va_block);
687     UVM_ASSERT(va_block);
688     UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block));
689 
690     chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
691     chunk->is_referenced = is_referenced;
692     chunk->va_block = va_block;
693     chunk_update_lists_locked(pmm, chunk);
694 
695     uvm_spin_unlock(&pmm->list_lock);
696 }
697 
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
699 {
700     gpu_unpin_temp(pmm, chunk, va_block, false);
701 }
702 
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
704 {
705     gpu_unpin_temp(pmm, chunk, va_block, true);
706 }
707 
void uvm_pmm_gpu_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker)
709 {
710     NV_STATUS status;
711 
712     if (!chunk)
713         return;
714 
715     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
716                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
717 
718     if (tracker) {
719         uvm_gpu_root_chunk_t *root_chunk;
720 
721         uvm_tracker_remove_completed(tracker);
722 
723         root_chunk = root_chunk_from_chunk(pmm, chunk);
724         root_chunk_lock(pmm, root_chunk);
725 
726         // Remove any completed entries from the root tracker to prevent it from
727         // growing too much over time.
728         uvm_tracker_remove_completed(&root_chunk->tracker);
729 
730         status = uvm_tracker_add_tracker_safe(&root_chunk->tracker, tracker);
731         if (status != NV_OK)
732             UVM_ASSERT(status == uvm_global_get_status());
733 
734         root_chunk_unlock(pmm, root_chunk);
735     }
736 
737     free_chunk(pmm, chunk);
738 }
739 
static NvU32 num_subchunks(uvm_gpu_chunk_t *parent)
741 {
742     uvm_chunk_size_t parent_size, child_size;
743     UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
744     parent_size = uvm_gpu_chunk_get_size(parent);
745     child_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
746     return (NvU32)uvm_div_pow2_64(parent_size, child_size);
747 }
748 
static uvm_gpu_chunk_t *next_sibling(uvm_gpu_chunk_t *chunk)
750 {
751     uvm_gpu_chunk_t *parent = chunk->parent;
752     size_t index;
753 
754     UVM_ASSERT(parent);
755     UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
756 
757     index = (size_t)uvm_div_pow2_64(chunk->address - parent->address, uvm_gpu_chunk_get_size(chunk));
758     UVM_ASSERT(index < num_subchunks(parent));
759 
760     ++index;
761     if (index == num_subchunks(parent))
762         return NULL;
763 
764     return parent->suballoc->subchunks[index];
765 }
766 
// Check that the chunk is in a mergeable state: all children must be pinned,
// or all children must be allocated with the same reverse mapping.
769 //
770 // Always returns true so it can be called from an assert macro.
static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
772 {
773     uvm_gpu_chunk_t *first_child = chunk->suballoc->subchunks[0];
774     uvm_va_block_t *child_va_block = first_child->va_block;
775     size_t i;
776 
777     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
778     UVM_ASSERT(first_child->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
779                first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
780 
781     for (i = 1; i < num_subchunks(chunk); i++) {
782         uvm_gpu_chunk_t *child = chunk->suballoc->subchunks[i];
783 
784         UVM_ASSERT(child->state == first_child->state);
785         if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
786             uvm_gpu_chunk_t *prev_child = chunk->suballoc->subchunks[i-1];
787 
788             UVM_ASSERT(child->va_block == child_va_block);
789             UVM_ASSERT(child->va_block_page_index ==
790                        prev_child->va_block_page_index + uvm_gpu_chunk_get_size(prev_child) / PAGE_SIZE);
791             UVM_ASSERT(child->is_referenced == prev_child->is_referenced);
792         }
793     }
794 
795     if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_FREE) {
796         UVM_ASSERT(chunk->suballoc->allocated == 0);
797     }
798     else {
799         UVM_ASSERT_MSG(chunk->suballoc->allocated == num_subchunks(chunk), "%u != %u\n",
800                 chunk->suballoc->allocated, num_subchunks(chunk));
801     }
802 
803     return true;
804 }
805 
806 // Merges a previously-split chunk. Assumes that all of its children have
807 // uniform state. This only merges leaves, so none of the children can be in the
808 // split state themselves.
809 //
810 // The children need to be removed from any lists before the merge.
811 //
812 // The merged chunk inherits the former state of its children.
static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
814 {
815     uvm_pmm_gpu_chunk_suballoc_t *suballoc;
816     uvm_gpu_chunk_t *subchunk;
817     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
818     uvm_pmm_gpu_chunk_state_t child_state;
819     size_t i, num_sub = num_subchunks(chunk);
820 
821     uvm_assert_mutex_locked(&pmm->lock);
822     UVM_ASSERT(assert_chunk_mergeable(pmm, chunk));
823 
824     // Transition the chunk state under the list lock first and then clean up
825     // the subchunk state.
826     uvm_spin_lock(&pmm->list_lock);
827 
828     child_state = chunk->suballoc->subchunks[0]->state;
829 
830     if (child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
831         subchunk = chunk->suballoc->subchunks[0];
832         UVM_ASSERT(subchunk->va_block);
833         chunk->va_block = subchunk->va_block;
834         chunk->va_block_page_index = subchunk->va_block_page_index;
835         chunk->is_referenced = subchunk->is_referenced;
836     }
837     else if (child_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
838         UVM_ASSERT(root_chunk->chunk.suballoc->pinned_leaf_chunks >= num_sub);
839         root_chunk->chunk.suballoc->pinned_leaf_chunks += 1 - num_sub;
840     }
841 
842     chunk->state = child_state;
843     suballoc = chunk->suballoc;
844     chunk->suballoc = NULL;
845 
846     // The resulting chunk is assumed to be non-zero as a simplification,
847     // instead of checking that all the subchunks are zero, since callers of
848     // uvm_pmm_gpu_alloc are not required to clear it. However, we think that
849     // this covers all relevant cases since it is uncommon to split a chunk and
850     // not to use any of the subchunks later on.
851     chunk->is_zero = false;
852 
853     uvm_spin_unlock(&pmm->list_lock);
854 
855     for (i = 0; i < num_sub; i++) {
856         subchunk = suballoc->subchunks[i];
857 
858         // The subchunks should have been removed from their lists prior to the
859         // merge.
860         UVM_ASSERT(list_empty(&subchunk->list));
861 
862         if (child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED)
863             UVM_ASSERT(subchunk->va_block != NULL);
864 
865         kmem_cache_free(CHUNK_CACHE, subchunk);
866     }
867 
868     kmem_cache_free(chunk_split_cache[ilog2(num_sub)].cache, suballoc);
869 }
870 
871 // Checks that chunk is below ancestor in the tree. Always returns true so it
872 // can be called from an assert macro.
static bool assert_chunk_under(uvm_gpu_chunk_t *chunk, uvm_gpu_chunk_t *ancestor)
874 {
875     UVM_ASSERT(ancestor->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
876     UVM_ASSERT(ancestor->suballoc);
877     UVM_ASSERT(ancestor->address <= chunk->address);
878     UVM_ASSERT(chunk->address < ancestor->address + uvm_gpu_chunk_get_size(ancestor));
879     UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) <= uvm_gpu_chunk_get_size(ancestor));
880     return true;
881 }
882 
883 // Traverses the chunk tree from start in the given traversal order.
884 //
885 // If the callback returns a status value of NV_WARN_NOTHING_TO_DO when doing
886 // pre-order traversal, the traversal skips walking below that chunk. In all
887 // other cases, returning any non-NV_OK value stops the walk immediately and
888 // returns that status to the caller.
889 //
890 // Be careful modifying the tree from the callback. Changing the tree below the
891 // input chunk is fine and modifying the input chunk itself is fine, but the
892 // callback must not modify the tree above the input chunk. If that is needed,
893 // return a non-NV_OK status from the walk and re-start the walk.
static NV_STATUS chunk_walk(uvm_pmm_gpu_t *pmm,
895                             uvm_gpu_chunk_t *start,
896                             chunk_walk_func_t func,
897                             void *data,
898                             chunk_walk_order_t order)
899 {
900     NV_STATUS status = NV_OK;
901     uvm_gpu_chunk_t *curr, *sibling;
902 
903     curr = start;
904 
905     do {
906         if (curr != start)
907             UVM_ASSERT(assert_chunk_under(curr, start));
908 
909         if (order == CHUNK_WALK_PRE_ORDER) {
910             status = func(pmm, curr, data);
911             if (status != NV_OK && status != NV_WARN_NOTHING_TO_DO)
912                 return status;
913         }
914 
915         // Skip downward traversal on pre-order if requested
916         if (status != NV_WARN_NOTHING_TO_DO && curr->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) {
917             // If the chunk is split, walk down
918             curr = curr->suballoc->subchunks[0];
919         }
920         else {
921             // This is a leaf chunk. If not start itself, check siblings.
922             while (curr != start) {
923                 if (order == CHUNK_WALK_POST_ORDER) {
924                     status = func(pmm, curr, data);
925                     if (status != NV_OK)
926                         return status;
927                 }
928 
929                 sibling = next_sibling(curr);
930                 if (sibling) {
931                     curr = sibling;
932                     break;
933                 }
934 
935                 // curr is the last chunk in its parent. Walk up and try again.
936                 curr = curr->parent;
937                 UVM_ASSERT(curr);
938                 UVM_ASSERT(curr->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
939             }
940         }
941     } while (curr != start);
942 
943     // Invoke the final callback for start
944     if (order == CHUNK_WALK_POST_ORDER)
945         return func(pmm, curr, data);
946 
947     return NV_OK;
948 }
949 
static NV_STATUS chunk_walk_pre_order(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *start, chunk_walk_func_t func, void *data)
951 {
952     return chunk_walk(pmm, start, func, data, CHUNK_WALK_PRE_ORDER);
953 }
954 
static NV_STATUS chunk_walk_post_order(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *start, chunk_walk_func_t func, void *data)
956 {
957     return chunk_walk(pmm, start, func, data, CHUNK_WALK_POST_ORDER);
958 }
959 
960 typedef struct
961 {
962     // Target size for the leaf subchunks
963     uvm_chunk_size_t min_size;
964 
965     // Number of subchunks split to this point. If the subchunks array is non-
966     // NULL, this is the number of elements currently in the array.
967     size_t num_subchunks_curr;
968 
969     // Number of subchunks needed for the whole split
970     size_t num_subchunks_total;
971 
972     // Storage for the final split chunks. May be NULL.
973     uvm_gpu_chunk_t **subchunks;
974 
975     // For testing
976     bool inject_error;
977 } split_walk_t;
978 
static NV_STATUS split_walk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
980 {
981     uvm_chunk_size_t chunk_size, child_size;
982     uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[chunk->type];
983     size_t i, num_children;
984     split_walk_t *args = data;
985     NV_STATUS status;
986 
987     chunk_size = uvm_gpu_chunk_get_size(chunk);
988     UVM_ASSERT(chunk_size > args->min_size);
989 
990     child_size = uvm_chunk_find_prev_size(chunk_sizes, chunk_size);
991     UVM_ASSERT(child_size != UVM_CHUNK_SIZE_INVALID);
992     num_children = chunk_size / child_size;
993 
994     if (unlikely(args->inject_error)) {
995         // Inject errors on the last split. inject_split_error is a bitfield,
996         // so we must take the lock to modify it. This path is only used in
997         // testing.
998         if (child_size == args->min_size &&
999             args->num_subchunks_curr + num_children == args->num_subchunks_total) {
1000             uvm_spin_lock(&pmm->list_lock);
1001             chunk->inject_split_error = true;
1002             uvm_spin_unlock(&pmm->list_lock);
1003         }
1004     }
1005 
1006     status = split_gpu_chunk(pmm, chunk);
1007     if (status != NV_OK)
1008         return status;
1009 
1010     // If we've hit our target, add all child subchunks to the array
1011     if (child_size == args->min_size) {
1012         for (i = 0; i < num_children; i++) {
1013             UVM_ASSERT(args->num_subchunks_curr < args->num_subchunks_total);
1014             if (args->subchunks)
1015                 args->subchunks[args->num_subchunks_curr] = chunk->suballoc->subchunks[i];
1016             ++args->num_subchunks_curr;
1017         }
1018 
1019         // No need to walk below this chunk
1020         return NV_WARN_NOTHING_TO_DO;
1021     }
1022 
1023     return NV_OK;
1024 }
1025 
static NV_STATUS merge_walk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1027 {
1028     // The merge walk uses post-order traversal, so all subchunks are guaranteed
1029     // to have already been merged.
1030     if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
1031         merge_gpu_chunk(pmm, chunk);
1032     return NV_OK;
1033 }
1034 
static void uvm_pmm_gpu_merge_chunk_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1036 {
1037     NV_STATUS status;
1038 
1039     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
1040                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1041                chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1042 
1043     uvm_assert_mutex_locked(&pmm->lock);
1044 
1045     status = chunk_walk_post_order(pmm, chunk, merge_walk_func, NULL);
1046 
1047     // merge_walk_func can't fail
1048     UVM_ASSERT(status == NV_OK);
1049     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1050                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1051 }
1052 
NV_STATUS uvm_pmm_gpu_split_chunk(uvm_pmm_gpu_t *pmm,
1054                                   uvm_gpu_chunk_t *chunk,
1055                                   uvm_chunk_size_t subchunk_size,
1056                                   uvm_gpu_chunk_t **subchunks)
1057 {
1058     NV_STATUS status;
1059     split_walk_t walk_args =
1060     {
1061         .min_size               = subchunk_size,
1062         .num_subchunks_curr     = 0,
1063         .num_subchunks_total    = uvm_gpu_chunk_get_size(chunk) / subchunk_size,
1064         .subchunks              = subchunks,
1065         .inject_error           = chunk->inject_split_error,
1066     };
1067 
1068     UVM_ASSERT(is_power_of_2(subchunk_size));
1069     UVM_ASSERT(subchunk_size & pmm->chunk_sizes[chunk->type]);
1070     UVM_ASSERT(subchunk_size < uvm_gpu_chunk_get_size(chunk));
1071 
1072     uvm_mutex_lock(&pmm->lock);
1073 
1074     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1075                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1076 
1077     // If we're supposed to inject an error, clear out the root chunk's flag so
1078     // we can inject after nearly all chunks have been split. Otherwise
1079     // split_gpu_chunk will fail on the first try, without creating the tree.
1080     if (unlikely(walk_args.inject_error)) {
1081         // inject_split_error is a bitfield, so we must take the lock to modify
1082         // it. This path is only used in testing.
1083         uvm_spin_lock(&pmm->list_lock);
1084         chunk->inject_split_error = false;
1085         uvm_spin_unlock(&pmm->list_lock);
1086     }
1087 
1088     status = chunk_walk_pre_order(pmm, chunk, split_walk_func, &walk_args);
1089     if (status != NV_OK) {
1090         // Put the chunk back in its original state
1091         uvm_pmm_gpu_merge_chunk_locked(pmm, chunk);
1092     }
1093     else {
1094         UVM_ASSERT(walk_args.num_subchunks_curr == walk_args.num_subchunks_total);
1095     }
1096 
1097     uvm_mutex_unlock(&pmm->lock);
1098     return status;
1099 }
1100 
1101 typedef struct
1102 {
1103     size_t num_written;
1104     size_t num_to_write;
1105     size_t num_to_skip;
1106     uvm_gpu_chunk_t **subchunks;
1107 } get_subchunks_walk_t;
1108 
static NV_STATUS get_subchunks_walk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1110 {
1111     get_subchunks_walk_t *args = data;
1112 
1113     // We're only collecting leaf chunks
1114     if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
1115         return NV_OK;
1116 
1117     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1118                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1119 
1120     if (args->num_to_skip) {
1121         --args->num_to_skip;
1122         return NV_OK;
1123     }
1124 
1125     UVM_ASSERT(args->num_written < args->num_to_write);
1126     args->subchunks[args->num_written++] = chunk;
1127 
1128     // Bail immediately once we hit our limit. Note that this is not an error:
1129     // we just need to exit the walk.
1130     if (args->num_written == args->num_to_write)
1131         return NV_ERR_OUT_OF_RANGE;
1132 
1133     return NV_OK;
1134 }
1135 
size_t uvm_pmm_gpu_get_subchunks(uvm_pmm_gpu_t *pmm,
1137                                  uvm_gpu_chunk_t *parent,
1138                                  size_t start_index,
1139                                  size_t num_subchunks,
1140                                  uvm_gpu_chunk_t **subchunks)
1141 {
1142     NV_STATUS status;
1143 
1144     get_subchunks_walk_t walk_args =
1145     {
1146         .num_written    = 0,
1147         .num_to_write   = num_subchunks,
1148         .num_to_skip    = start_index,
1149         .subchunks      = subchunks,
1150     };
1151 
1152     if (num_subchunks == 0)
1153         return 0;
1154 
1155     UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1156                parent->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1157                parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
1158 
1159     uvm_mutex_lock(&pmm->lock);
1160 
1161     // Either pre- or post-order would work. Pick post-order just because we
1162     // only care about leaf chunks and we may exit early, so we'd get slightly
1163     // fewer callbacks.
1164     status = chunk_walk_post_order(pmm, parent, get_subchunks_walk_func, &walk_args);
1165     if (status != NV_OK) {
1166         UVM_ASSERT(status == NV_ERR_OUT_OF_RANGE);
1167         UVM_ASSERT(walk_args.num_written == walk_args.num_to_write);
1168     }
1169 
1170     uvm_mutex_unlock(&pmm->lock);
1171     return walk_args.num_written;
1172 }
1173 
static uvm_gpu_chunk_t *list_first_chunk(struct list_head *list)
1175 {
1176     return list_first_entry_or_null(list, uvm_gpu_chunk_t, list);
1177 }
1178 
void uvm_pmm_gpu_merge_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1180 {
1181     uvm_mutex_lock(&pmm->lock);
1182     uvm_pmm_gpu_merge_chunk_locked(pmm, chunk);
1183     uvm_mutex_unlock(&pmm->lock);
1184 }
1185 
static void root_chunk_unmap_indirect_peer(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_gpu_t *other_gpu)
1187 {
1188     uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1189     size_t index = root_chunk_index(pmm, root_chunk);
1190     long long new_count;
1191     NV_STATUS status;
1192 
1193     indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
1194 
1195     uvm_assert_root_chunk_locked(pmm, root_chunk);
1196     UVM_ASSERT(indirect_peer->dma_addrs);
1197     UVM_ASSERT(root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
1198     UVM_ASSERT(uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id));
1199 
1200     // The tracker could have work which requires the indirect peer mappings to
1201     // remain until finished, such as PTE unmaps of this chunk from indirect
1202     // peers, so we need to wait. We also need to wait on the entire tracker,
1203     // not just other_gpu's entries, because there might be implicit chained
1204     // dependencies in the tracker.
1205     //
1206     // We know there can't be any other work which requires these mappings:
1207     // - If we're freeing the root chunk back to PMA or switching types of the
1208     //   root chunk, nothing else can reference the chunk.
1209     //
1210     // - If the chunk is still allocated then global peer access must be in the
1211     //   process of being disabled, say because one of the GPUs is being
1212     //   unregistered. We know that all VA spaces must have already called
1213     //   disable_peers and have waited on those PTE unmaps. The chunk could be
1214     //   freed concurrently with this indirect peer unmap, but that will be
1215     //   serialized by the root chunk lock.
1216     status = uvm_tracker_wait(&root_chunk->tracker);
1217     if (status != NV_OK)
1218         UVM_ASSERT(uvm_global_get_status() != NV_OK);
1219 
1220     uvm_parent_gpu_unmap_cpu_pages(other_gpu->parent, indirect_peer->dma_addrs[index], UVM_CHUNK_SIZE_MAX);
1221     uvm_processor_mask_clear(&root_chunk->indirect_peers_mapped, other_gpu->id);
1222     new_count = atomic64_dec_return(&indirect_peer->map_count);
1223     UVM_ASSERT(new_count >= 0);
1224 }
1225 
static void root_chunk_unmap_indirect_peers(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
1227 {
1228     uvm_gpu_id_t other_gpu_id;
1229 
1230     // Root chunks should use a global processor mask as they are not bound to
1231     // a specific VA space. However, indirect peers are not supported when SMC
1232     // partitioning is enabled and, therefore, we can obtain the uvm_gpu_t
1233     // object directly from the uvm_parent_gpu_t object's id.
1234     for_each_gpu_id_in_mask(other_gpu_id, &root_chunk->indirect_peers_mapped) {
1235         uvm_gpu_t *other_gpu = uvm_gpu_get(other_gpu_id);
1236         root_chunk_unmap_indirect_peer(pmm, root_chunk, other_gpu);
1237     }
1238 }
1239 
NV_STATUS uvm_pmm_gpu_indirect_peer_init(uvm_pmm_gpu_t *pmm, uvm_gpu_t *accessing_gpu)
1241 {
1242     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1243     NvU64 *dma_addrs;
1244     uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1245     NV_STATUS status = NV_OK;
1246 
1247     indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
1248 
1249     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1250     UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
1251     UVM_ASSERT(!indirect_peer->dma_addrs);
1252     UVM_ASSERT(atomic64_read(&indirect_peer->map_count) == 0);
1253 
1254     // Each root chunk tracks whether it has a mapping to a given indirect peer,
1255     // so we don't need to initialize this array.
1256     dma_addrs = uvm_kvmalloc(pmm->root_chunks.count * sizeof(dma_addrs[0]));
1257     if (!dma_addrs)
1258         status = NV_ERR_NO_MEMORY;
1259     else
1260         indirect_peer->dma_addrs = dma_addrs;
1261 
1262     return status;
1263 }
1264 
check_indirect_peer_empty(uvm_pmm_gpu_t * pmm,uvm_gpu_t * other_gpu)1265 static bool check_indirect_peer_empty(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu)
1266 {
1267     uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1268     size_t i;
1269 
1270     indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
1271 
1272     for (i = 0; i < pmm->root_chunks.count; i++) {
1273         uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
1274 
1275         // This doesn't take the root chunk lock because checking the mask is an
1276         // atomic operation.
1277         if (uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id)) {
1278             UVM_ASSERT(atomic64_read(&indirect_peer->map_count) > 0);
1279             return false;
1280         }
1281     }
1282 
1283     UVM_ASSERT(atomic64_read(&indirect_peer->map_count) == 0);
1284     return true;
1285 }
1286 
uvm_pmm_gpu_indirect_peer_destroy(uvm_pmm_gpu_t * pmm,uvm_gpu_t * other_gpu)1287 void uvm_pmm_gpu_indirect_peer_destroy(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu)
1288 {
1289     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1290     uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1291     size_t i;
1292 
1293     indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
1294 
1295     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1296     UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, other_gpu));
1297 
1298     if (!indirect_peer->dma_addrs) {
1299         UVM_ASSERT(check_indirect_peer_empty(pmm, other_gpu));
1300         return;
1301     }
1302 
1303     // Just go over all root chunks and unmap them. This is slow, but it is not
1304     // a frequent operation.
1305     for (i = 0; i < pmm->root_chunks.count && atomic64_read(&indirect_peer->map_count); i++) {
1306         uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
1307 
1308         // Take the root chunk lock to prevent chunks from transitioning in or
1309         // out of the PMA_OWNED state, and to serialize updates to the tracker
1310         // and indirect_peers_mapped mask. Note that indirect peers besides
1311         // other_gpu could be trying to create mappings concurrently.
1312         root_chunk_lock(pmm, root_chunk);
1313 
1314         if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
1315             UVM_ASSERT(uvm_processor_mask_empty(&root_chunk->indirect_peers_mapped));
1316         else if (uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id))
1317             root_chunk_unmap_indirect_peer(pmm, root_chunk, other_gpu);
1318 
1319         root_chunk_unlock(pmm, root_chunk);
1320     }
1321 
1322     UVM_ASSERT(check_indirect_peer_empty(pmm, other_gpu));
1323 
1324     uvm_kvfree(indirect_peer->dma_addrs);
1325     indirect_peer->dma_addrs = NULL;
1326 }
1327 
uvm_pmm_gpu_indirect_peer_map(uvm_pmm_gpu_t * pmm,uvm_gpu_chunk_t * chunk,uvm_gpu_t * accessing_gpu)1328 NV_STATUS uvm_pmm_gpu_indirect_peer_map(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu)
1329 {
1330     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1331     uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1332     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
1333     size_t index = root_chunk_index(pmm, root_chunk);
1334     NV_STATUS status = NV_OK;
1335 
1336     indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
1337 
1338     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1339                chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1340 
1341     UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
1342     UVM_ASSERT(indirect_peer->dma_addrs);
1343 
1344     // Serialize:
1345     //  - Concurrent mappings to this root chunk (same or different GPUs)
1346     //  - Concurrent unmappings of this root chunk (must be a different GPU)
1347     root_chunk_lock(pmm, root_chunk);
1348 
1349     if (!uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id)) {
1350         status = uvm_parent_gpu_map_cpu_pages(accessing_gpu->parent,
1351                                               uvm_gpu_chunk_to_page(pmm, &root_chunk->chunk),
1352                                               UVM_CHUNK_SIZE_MAX,
1353                                               &indirect_peer->dma_addrs[index]);
1354         if (status == NV_OK) {
1355             uvm_processor_mask_set(&root_chunk->indirect_peers_mapped, accessing_gpu->id);
1356             atomic64_inc(&indirect_peer->map_count);
1357         }
1358     }
1359 
1360     root_chunk_unlock(pmm, root_chunk);
1361     return status;
1362 }
1363 
uvm_pmm_gpu_indirect_peer_addr(uvm_pmm_gpu_t * pmm,uvm_gpu_chunk_t * chunk,uvm_gpu_t * accessing_gpu)1364 NvU64 uvm_pmm_gpu_indirect_peer_addr(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu)
1365 {
1366     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1367     uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1368     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
1369     size_t index = root_chunk_index(pmm, root_chunk);
1370     NvU64 chunk_offset = chunk->address - root_chunk->chunk.address;
1371 
1372     indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
1373 
1374     UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
1375     UVM_ASSERT(indirect_peer->dma_addrs);
1376     UVM_ASSERT(uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id));
1377     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1378                chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1379                chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
1380 
1381     return indirect_peer->dma_addrs[index] + chunk_offset;
1382 }
1383 
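// Illustrative sketch (not from the driver, names are hypothetical): a caller
// that needs a peer-visible address for a chunk owned by this PMM's GPU first
// establishes the indirect peer mapping and then queries the DMA address:
//
//     NV_STATUS status;
//     NvU64 peer_addr;
//
//     status = uvm_pmm_gpu_indirect_peer_map(pmm, chunk, accessing_gpu);
//     if (status != NV_OK)
//         return status;
//
//     // The root chunk is now mapped for accessing_gpu, so the sysmem-like
//     // DMA address of the chunk can be computed.
//     peer_addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, accessing_gpu);
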
uvm_gpu_phys_address_t uvm_pmm_gpu_peer_phys_address(uvm_pmm_gpu_t *pmm,
                                                     uvm_gpu_chunk_t *chunk,
                                                     uvm_gpu_t *accessing_gpu)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
    uvm_aperture_t aperture = uvm_gpu_peer_aperture(accessing_gpu, gpu);
    NvU64 addr;

    if (peer_caps->is_indirect_peer)
        addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, accessing_gpu);
    else if (uvm_gpus_are_nvswitch_connected(accessing_gpu, gpu))
        addr = chunk->address + gpu->parent->nvswitch_info.fabric_memory_window_start;
    else
        addr = chunk->address;

    return uvm_gpu_phys_address(aperture, addr);
}

uvm_gpu_address_t uvm_pmm_gpu_peer_copy_address(uvm_pmm_gpu_t *pmm,
                                                uvm_gpu_chunk_t *chunk,
                                                uvm_gpu_t *accessing_gpu)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
    uvm_gpu_identity_mapping_t *gpu_peer_mapping;

    if (peer_caps->is_indirect_peer ||
        (accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_PHYSICAL)) {
        // Indirect peers are accessed as sysmem addresses, so they don't need
        // to use identity mappings.
        return uvm_gpu_address_from_phys(uvm_pmm_gpu_peer_phys_address(pmm, chunk, accessing_gpu));
    }

    UVM_ASSERT(accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL);
    gpu_peer_mapping = uvm_gpu_get_peer_mapping(accessing_gpu, gpu->id);

    return uvm_gpu_address_virtual(gpu_peer_mapping->base + chunk->address);
}

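// Illustrative sketch (not from the driver): picking the address an accessing
// GPU should use to copy to or from a chunk owned by this PMM's GPU. Depending
// on the peer type and peer_copy_mode, the result is either a physical address
// (indirect peers, physical-mode peer copies) or a virtual address within the
// accessing GPU's peer identity mapping:
//
//     uvm_gpu_address_t copy_addr;
//
//     copy_addr = uvm_pmm_gpu_peer_copy_address(pmm, chunk, accessing_gpu);
//     // copy_addr can then be used as a source or destination operand of a
//     // copy issued on accessing_gpu.
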
static NV_STATUS evict_root_chunk_from_va_block(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_va_block_t *va_block)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    NV_STATUS status;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();

    UVM_ASSERT(va_block);

    // To evict the chunks from the VA block we need to lock it, but we already
    // have the PMM lock held. Unlock it first and re-lock it after.
    uvm_mutex_unlock(&pmm->lock);

    uvm_mutex_lock(&va_block->lock);

    status = uvm_va_block_evict_chunks(va_block, gpu, &root_chunk->chunk, &tracker);

    uvm_mutex_unlock(&va_block->lock);

    // The block has been retained by find_and_retain_va_block_to_evict(),
    // release it here as it's not needed any more. Notably do that even if
    // uvm_va_block_evict_chunks() fails.
    uvm_va_block_release(va_block);

    if (status == NV_OK) {
        root_chunk_lock(pmm, root_chunk);
        status = uvm_tracker_add_tracker_safe(&root_chunk->tracker, &tracker);
        root_chunk_unlock(pmm, root_chunk);
    }

    uvm_tracker_deinit(&tracker);

    uvm_mutex_lock(&pmm->lock);

    return status;
}

void uvm_pmm_gpu_mark_chunk_evicted(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    uvm_spin_lock(&pmm->list_lock);

    UVM_ASSERT(chunk_is_in_eviction(pmm, chunk));
    UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
    UVM_ASSERT(chunk->va_block != NULL);

    chunk->va_block = NULL;
    chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
    chunk_pin(pmm, chunk);

    uvm_spin_unlock(&pmm->list_lock);
}

static NV_STATUS pin_free_chunks_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
{
    uvm_assert_mutex_locked(&pmm->lock);

    uvm_spin_lock(&pmm->list_lock);

    UVM_ASSERT(chunk_is_in_eviction(pmm, chunk));

    if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE) {
        list_del_init(&chunk->list);
        chunk_pin(pmm, chunk);
        if (chunk->parent)
            chunk->parent->suballoc->allocated++;
    }

    uvm_spin_unlock(&pmm->list_lock);

    return NV_OK;
}

static NV_STATUS free_first_pinned_chunk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
{
    uvm_assert_mutex_locked(&pmm->lock);

    UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));

    if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
        free_chunk_with_merges(pmm, chunk);
        return NV_ERR_MORE_DATA_AVAILABLE;
    }

    return NV_OK;
}

typedef struct
{
    uvm_va_block_t *va_block_to_evict_from;
} evict_data_t;

static NV_STATUS find_and_retain_va_block_to_evict(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
{
    NV_STATUS status = NV_OK;
    evict_data_t *evict_data = (evict_data_t *)data;

    UVM_ASSERT(evict_data->va_block_to_evict_from == NULL);

    uvm_spin_lock(&pmm->list_lock);

    // All free chunks should have been pinned already by pin_free_chunks_func().
    UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
                   chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
                   chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT,
                   "state %s\n", uvm_pmm_gpu_chunk_state_string(chunk->state));

    if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
        UVM_ASSERT(chunk->va_block);
        evict_data->va_block_to_evict_from = chunk->va_block;
        uvm_va_block_retain(chunk->va_block);
        status = NV_ERR_MORE_DATA_AVAILABLE;
    }

    uvm_spin_unlock(&pmm->list_lock);

    return status;
}

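// Illustrative sketch (hypothetical callback, not used by the driver): the
// walk callbacks above follow the chunk_walk_pre_order() protocol where
// returning NV_OK continues the walk and returning any other status (by
// convention NV_ERR_MORE_DATA_AVAILABLE here) stops it early. A callback that
// merely counted allocated leaf chunks could look like this:
//
//     static NV_STATUS count_allocated_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
//     {
//         size_t *count = (size_t *)data;
//
//         // Like the real callbacks above, take the list lock before looking
//         // at the chunk state.
//         uvm_spin_lock(&pmm->list_lock);
//
//         if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED)
//             ++(*count);
//
//         uvm_spin_unlock(&pmm->list_lock);
//
//         // Returning NV_OK continues the walk to the remaining chunks.
//         return NV_OK;
//     }
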
static bool root_chunk_has_elevated_page(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
    struct page *page;

    if (!gpu->mem_info.numa.enabled)
        return false;

    page = uvm_gpu_chunk_to_page(pmm, chunk);

    return page_count(page) > UVM_CHUNK_SIZE_MAX / PAGE_SIZE;
}

static NV_STATUS evict_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_pmm_context_t pmm_context)
{
    NV_STATUS status;
    NV_STATUS free_status;
    uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
    const uvm_pmm_gpu_memory_type_t type = chunk->type;

    uvm_assert_mutex_locked(&pmm->lock);

    // First pin all the free subchunks
    status = chunk_walk_pre_order(pmm, chunk, pin_free_chunks_func, NULL);
    UVM_ASSERT(status == NV_OK);
    while (1) {
        evict_data_t evict = {0};
        status = chunk_walk_pre_order(pmm, chunk, find_and_retain_va_block_to_evict, &evict);

        // find_and_retain_va_block_to_evict() returns NV_ERR_MORE_DATA_AVAILABLE
        // immediately after finding the first VA block to evict from and NV_OK
        // if no more blocks are left.
        if (status != NV_ERR_MORE_DATA_AVAILABLE) {
            UVM_ASSERT(status == NV_OK);
            break;
        }

        // Evict the chunks from the VA block. Notably this will unlock and
        // re-lock the PMM mutex. This is ok as we don't rely on any PMM state
        // that can change across the calls. In particular, the walk to pick the
        // next VA block to evict above is always started from the root chunk.
        status = evict_root_chunk_from_va_block(pmm, root_chunk, evict.va_block_to_evict_from);
        if (status != NV_OK)
            goto error;
    }

    // All of the leaf chunks should be pinned now, merge them all back into a
    // pinned root chunk.
    uvm_pmm_gpu_merge_chunk_locked(pmm, chunk);

    uvm_spin_lock(&pmm->list_lock);

    UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
    uvm_gpu_chunk_set_in_eviction(chunk, false);

    chunk->is_zero = false;

    uvm_spin_unlock(&pmm->list_lock);

    // Bug 2085760: Check if there is any page within the evicted chunk with an
    // elevated refcount. In such case there is another holder of the page,
    // which prevents us from reusing it. This can happen on systems where
    // struct pages backed by GPU memory are directly available to third-party
    // device drivers. Note that at this point, the chunk ends up not being in
    // a chunk free list. We can just free it, so PMA will handle the page with
    // elevated refcount.
    if (root_chunk_has_elevated_page(pmm, root_chunk)) {
        free_root_chunk(pmm, root_chunk, free_root_chunk_mode_from_pmm_context(pmm_context));
        return NV_ERR_IN_USE;
    }

    UVM_ASSERT(check_chunk(pmm, chunk));

    return NV_OK;

error:
    // On error we need to free all the chunks that we were able to evict so
    // far. They should all be pinned.

    // Clear the eviction state so any new chunks freed by other threads are
    // actually freed instead of pinned. We need the list lock to make the
    // eviction check and conditional pin in chunk_free_locked atomic with our
    // free-if-pinned loop below.
    uvm_spin_lock(&pmm->list_lock);

    uvm_gpu_chunk_set_in_eviction(chunk, false);

    // In case we didn't manage to evict any chunks and hence the root is still
    // unpinned, we need to put it back on an eviction list.
    // chunk_update_lists_locked() will do that.
    chunk_update_lists_locked(pmm, chunk);

    uvm_spin_unlock(&pmm->list_lock);

    do {
        free_status = chunk_walk_pre_order(pmm, chunk, free_first_pinned_chunk_func, NULL);
    } while (free_status == NV_ERR_MORE_DATA_AVAILABLE);
    UVM_ASSERT(free_status == NV_OK);

    (void)free_next_available_root_chunk(pmm, type);

    return status;
}

static bool chunk_is_evictable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);

    uvm_assert_spinlock_locked(&pmm->list_lock);

    if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
        return false;

    if (chunk_is_root_chunk_pinned(pmm, chunk))
        return false;

    if (chunk_is_in_eviction(pmm, chunk))
        return false;

    // An evictable chunk's root should be on one of the eviction lists.
    UVM_ASSERT(!list_empty(&root_chunk->chunk.list));

    return true;
}

static void chunk_start_eviction(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
    chunk = &root_chunk->chunk;

    uvm_assert_spinlock_locked(&pmm->list_lock);

    UVM_ASSERT(chunk_is_evictable(pmm, chunk));
    UVM_ASSERT(!list_empty(&chunk->list));

    list_del_init(&chunk->list);
    uvm_gpu_chunk_set_in_eviction(chunk, true);
}

static void root_chunk_update_eviction_list(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, struct list_head *list)
{
    uvm_spin_lock(&pmm->list_lock);

    UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
    UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
    UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
               chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);

    if (!chunk_is_root_chunk_pinned(pmm, chunk) && !chunk_is_in_eviction(pmm, chunk)) {
        // An unpinned chunk not selected for eviction should be on one of the
        // eviction lists.
        UVM_ASSERT(!list_empty(&chunk->list));

        list_move_tail(&chunk->list, list);
    }

    uvm_spin_unlock(&pmm->list_lock);
}

void uvm_pmm_gpu_mark_root_chunk_used(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    root_chunk_update_eviction_list(pmm, chunk, &pmm->root_chunks.va_block_used);
}

void uvm_pmm_gpu_mark_root_chunk_unused(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    root_chunk_update_eviction_list(pmm, chunk, &pmm->root_chunks.va_block_unused);
}

static uvm_gpu_root_chunk_t *pick_root_chunk_to_evict(uvm_pmm_gpu_t *pmm)
{
    uvm_gpu_chunk_t *chunk;

    uvm_spin_lock(&pmm->list_lock);

    // Check if there are root chunks sitting in the free lists. Non-zero
    // chunks are preferred.
    chunk = list_first_chunk(find_free_list(pmm,
                                            UVM_PMM_GPU_MEMORY_TYPE_USER,
                                            UVM_CHUNK_SIZE_MAX,
                                            UVM_PMM_LIST_NO_ZERO));
    if (chunk)
        UVM_ASSERT(!chunk->is_zero);

    if (!chunk) {
        chunk = list_first_chunk(find_free_list(pmm,
                                                UVM_PMM_GPU_MEMORY_TYPE_USER,
                                                UVM_CHUNK_SIZE_MAX,
                                                UVM_PMM_LIST_ZERO));
        if (chunk)
            UVM_ASSERT(chunk->is_zero);
    }

    if (!chunk)
        chunk = list_first_chunk(&pmm->root_chunks.va_block_unused);

    // TODO: Bug 1765193: Move the chunks to the tail of the used list whenever
    // they get mapped.
    if (!chunk)
        chunk = list_first_chunk(&pmm->root_chunks.va_block_used);

    if (chunk)
        chunk_start_eviction(pmm, chunk);

    uvm_spin_unlock(&pmm->list_lock);

    if (chunk)
        return root_chunk_from_chunk(pmm, chunk);
    return NULL;
}

static NV_STATUS pick_and_evict_root_chunk(uvm_pmm_gpu_t *pmm,
                                           uvm_pmm_gpu_memory_type_t type,
                                           uvm_pmm_context_t pmm_context,
                                           uvm_gpu_chunk_t **out_chunk)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    NV_STATUS status;
    uvm_gpu_chunk_t *chunk;
    uvm_gpu_root_chunk_t *root_chunk;

    UVM_ASSERT(uvm_parent_gpu_supports_eviction(gpu->parent));

    uvm_assert_mutex_locked(&pmm->lock);

    root_chunk = pick_root_chunk_to_evict(pmm);
    if (!root_chunk)
        return NV_ERR_NO_MEMORY;

    status = evict_root_chunk(pmm, root_chunk, pmm_context);
    if (status != NV_OK)
        return status;

    chunk = &root_chunk->chunk;

    if (uvm_pmm_gpu_memory_type_is_kernel(type)) {
        NvU32 flags = 0;
        if (pmm_context == PMM_CONTEXT_PMA_EVICTION)
            flags |= UVM_PMA_CALLED_FROM_PMA_EVICTION;

        // Transitioning user memory type to kernel memory type requires pinning
        // it so that PMA doesn't pick it for eviction.
        status = nvUvmInterfacePmaPinPages(pmm->pma,
                                           &chunk->address,
                                           1,
                                           UVM_CHUNK_SIZE_MAX,
                                           flags);
        if (status == NV_ERR_IN_USE) {
            // Pinning can fail if some of the pages have been chosen for
            // eviction already. In that case free the root chunk back to PMA
            // and let the caller retry.
            free_root_chunk(pmm, root_chunk, free_root_chunk_mode_from_pmm_context(pmm_context));

            return status;
        }

        UVM_ASSERT_MSG(status == NV_OK,
                       "pmaPinPages(root_chunk=0x%llx) failed unexpectedly: %s\n",
                       chunk->address,
                       nvstatusToString(status));

        // Unmap any indirect peer physical mappings for this chunk, since
        // kernel chunks generally don't need them.
        root_chunk_lock(pmm, root_chunk);
        root_chunk_unmap_indirect_peers(pmm, root_chunk);
        root_chunk_unlock(pmm, root_chunk);

        uvm_spin_lock(&pmm->list_lock);
        chunk->type = type;
        uvm_spin_unlock(&pmm->list_lock);
    }

    *out_chunk = chunk;
    return NV_OK;
}

static NV_STATUS pick_and_evict_root_chunk_retry(uvm_pmm_gpu_t *pmm,
                                                 uvm_pmm_gpu_memory_type_t type,
                                                 uvm_pmm_context_t pmm_context,
                                                 uvm_gpu_chunk_t **out_chunk)
{
    NV_STATUS status;

    // Eviction can fail if the chunk gets selected for PMA eviction at
    // the same time. Keep retrying.
    do {
        status = pick_and_evict_root_chunk(pmm, type, pmm_context, out_chunk);
    } while (status == NV_ERR_IN_USE);

    return status;
}

static uvm_gpu_chunk_t *find_free_chunk_locked(uvm_pmm_gpu_t *pmm,
                                               uvm_pmm_gpu_memory_type_t type,
                                               uvm_chunk_size_t chunk_size,
                                               uvm_pmm_list_zero_t zero_type)
{
    struct list_head *free_list = find_free_list(pmm, type, chunk_size, zero_type);
    uvm_gpu_chunk_t *tmp, *chunk;

    uvm_assert_spinlock_locked(&pmm->list_lock);

    list_for_each_entry_safe(chunk, tmp, free_list, list) {
        if (zero_type == UVM_PMM_LIST_ZERO)
            UVM_ASSERT(chunk->is_zero);
        else
            UVM_ASSERT(!chunk->is_zero);

        if (chunk_is_in_eviction(pmm, chunk)) {
            // Remove chunks that have been picked for eviction from the free
            // lists. The eviction path does it with pin_free_chunks_func(),
            // but there is a window between when a root chunk is chosen for
            // eviction and all of its subchunks are removed from free lists.
            list_del_init(&chunk->list);
        }
        else {
            // Bug 2085760: When NUMA GPU is enabled, also check that the root
            // chunk containing the candidate free chunk doesn't have any page
            // escaped to another driver. If that is the case, just skip such
            // chunk hoping that the page will eventually lose the extra
            // reference.
            // References can only be added when a virtual mapping to the page
            // exists, so once a chunk in the free list has no elevated pages
            // the chunk is safe to reuse.
            if (!root_chunk_has_elevated_page(pmm, root_chunk_from_chunk(pmm, chunk)))
                return chunk;
        }
    }

    return NULL;
}

static uvm_gpu_chunk_t *claim_free_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_type_t type, uvm_chunk_size_t chunk_size)
{
    uvm_gpu_chunk_t *chunk;

    uvm_spin_lock(&pmm->list_lock);

    // Prefer zero free chunks as they are likely going to be used for a new
    // allocation.
    //
    // TODO: Bug 2446832: Allow callers to request non-zero chunks in PMM
    // allocation functions, so we don't waste zero chunks.
    chunk = find_free_chunk_locked(pmm, type, chunk_size, UVM_PMM_LIST_ZERO);

    if (!chunk)
        chunk = find_free_chunk_locked(pmm, type, chunk_size, UVM_PMM_LIST_NO_ZERO);

    if (!chunk)
        goto out;

    UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size, "chunk size %u expected %u\n",
            uvm_gpu_chunk_get_size(chunk), chunk_size);
    UVM_ASSERT(chunk->type == type);
    UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
    UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));

    if (chunk->parent) {
        UVM_ASSERT(chunk->parent->suballoc);
        UVM_ASSERT(chunk->parent->type == type);
        UVM_ASSERT(chunk->parent->suballoc->allocated < num_subchunks(chunk->parent));
        chunk->parent->suballoc->allocated++;
    }

    chunk_pin(pmm, chunk);
    chunk_update_lists_locked(pmm, chunk);

out:
    uvm_spin_unlock(&pmm->list_lock);

    return chunk;
}

static NV_STATUS alloc_or_evict_root_chunk(uvm_pmm_gpu_t *pmm,
                                           uvm_pmm_gpu_memory_type_t type,
                                           uvm_pmm_alloc_flags_t flags,
                                           uvm_gpu_chunk_t **chunk_out)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    NV_STATUS status;
    uvm_gpu_chunk_t *chunk;

    status = alloc_root_chunk(pmm, type, flags, &chunk);
    if (status != NV_OK) {
        if ((flags & UVM_PMM_ALLOC_FLAGS_EVICT) && uvm_parent_gpu_supports_eviction(gpu->parent))
            status = pick_and_evict_root_chunk_retry(pmm, type, PMM_CONTEXT_DEFAULT, chunk_out);

        return status;
    }

    *chunk_out = chunk;
    return status;
}

// Same as alloc_or_evict_root_chunk(), but without the PMM lock held.
static NV_STATUS alloc_or_evict_root_chunk_unlocked(uvm_pmm_gpu_t *pmm,
                                                    uvm_pmm_gpu_memory_type_t type,
                                                    uvm_pmm_alloc_flags_t flags,
                                                    uvm_gpu_chunk_t **chunk_out)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    NV_STATUS status;
    uvm_gpu_chunk_t *chunk;

    status = alloc_root_chunk(pmm, type, flags, &chunk);
    if (status != NV_OK) {
        if ((flags & UVM_PMM_ALLOC_FLAGS_EVICT) && uvm_parent_gpu_supports_eviction(gpu->parent)) {
            uvm_mutex_lock(&pmm->lock);
            status = pick_and_evict_root_chunk_retry(pmm, type, PMM_CONTEXT_DEFAULT, chunk_out);
            uvm_mutex_unlock(&pmm->lock);
        }

        return status;
    }

    *chunk_out = chunk;
    return status;
}

static NV_STATUS alloc_chunk_with_splits(uvm_pmm_gpu_t *pmm,
                                         uvm_pmm_gpu_memory_type_t type,
                                         uvm_chunk_size_t chunk_size,
                                         uvm_pmm_alloc_flags_t flags,
                                         uvm_gpu_chunk_t **out_chunk)
{
    NV_STATUS status;
    uvm_chunk_size_t cur_size;
    uvm_gpu_chunk_t *chunk;
    uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[type];

    uvm_assert_mutex_locked(&pmm->lock);
    UVM_ASSERT(chunk_size != UVM_CHUNK_SIZE_MAX);

    // Check for a free chunk again in case a different thread freed something
    // up while this thread was waiting for the PMM lock.
    chunk = claim_free_chunk(pmm, type, chunk_size);
    if (chunk) {
        // A free chunk was claimed, return immediately.
        UVM_ASSERT(check_chunk(pmm, chunk));

        *out_chunk = chunk;
        return NV_OK;
    }

    cur_size = chunk_size;

    // Look for a bigger free chunk that can be split
    for_each_chunk_size_from(cur_size, chunk_sizes) {
        chunk = claim_free_chunk(pmm, type, cur_size);
        if (chunk)
            break;
    }

    if (unlikely(!chunk)) {
        status = alloc_or_evict_root_chunk(pmm, type, flags, &chunk);
        if (status != NV_OK)
            return status;
        cur_size = UVM_CHUNK_SIZE_MAX;
        UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == cur_size);
    }

    UVM_ASSERT(chunk);

    for_each_chunk_size_rev_from(cur_size, chunk_sizes) {
        NvU32 i;
        uvm_gpu_chunk_t *parent;

        UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == cur_size);
        UVM_ASSERT(chunk->type == type);
        UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);

        if (chunk->parent) {
            UVM_ASSERT(chunk->parent->suballoc);
            UVM_ASSERT(uvm_gpu_chunk_get_size(chunk->parent) == uvm_chunk_find_next_size(chunk_sizes, cur_size));
            UVM_ASSERT(chunk->parent->type == type);
            UVM_ASSERT_MSG(chunk->parent->suballoc->allocated <= num_subchunks(chunk->parent), "allocated %u num %u\n",
                    chunk->parent->suballoc->allocated, num_subchunks(chunk->parent));
        }

        if (cur_size == chunk_size) {
            *out_chunk = chunk;
            return NV_OK;
        }

        status = split_gpu_chunk(pmm, chunk);
        if (status != NV_OK) {
            free_chunk_with_merges(pmm, chunk);
            return status;
        }

        parent = chunk;

        // Use the first subchunk for further splitting, if needed.
        chunk = parent->suballoc->subchunks[0];

        // And add the rest to the free list
        uvm_spin_lock(&pmm->list_lock);

        for (i = 1; i < num_subchunks(parent); ++i)
            chunk_free_locked(pmm, parent->suballoc->subchunks[i]);

        uvm_spin_unlock(&pmm->list_lock);
    }
    UVM_PANIC();
}

// Allocates a single chunk of a given size. If needed, splits a chunk of
// bigger size or, if that is not possible, allocates from PMA or evicts.
NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
                      uvm_pmm_gpu_memory_type_t type,
                      uvm_chunk_size_t chunk_size,
                      uvm_pmm_alloc_flags_t flags,
                      uvm_gpu_chunk_t **out_chunk)
{
    NV_STATUS status;
    uvm_gpu_chunk_t *chunk;

    chunk = claim_free_chunk(pmm, type, chunk_size);
    if (chunk) {
        // A free chunk could be claimed, we are done.
        goto out;
    }

    if (chunk_size == UVM_CHUNK_SIZE_MAX) {
        // For chunks of root chunk size we won't be doing any splitting so we
        // can just directly try allocating without holding the PMM lock. If
        // eviction is necessary, the lock will be acquired internally.
        status = alloc_or_evict_root_chunk_unlocked(pmm, type, flags, &chunk);
        if (status != NV_OK)
            return status;

        goto out;
    }

    // We didn't find a free chunk and we will require splits so acquire the
    // PMM lock.
    uvm_mutex_lock(&pmm->lock);

    status = alloc_chunk_with_splits(pmm, type, chunk_size, flags, &chunk);

    uvm_mutex_unlock(&pmm->lock);

    if (status != NV_OK) {
        (void)free_next_available_root_chunk(pmm, type);
        return status;
    }

out:
    *out_chunk = chunk;

    return NV_OK;
}

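// Illustrative sketch (not from the driver, the size and flags below are
// hypothetical): a typical call into alloc_chunk(). On success the chunk is
// returned temporarily pinned, and the caller is expected to hand it back
// through the free path so any splits performed here can be merged again:
//
//     uvm_gpu_chunk_t *chunk;
//     NV_STATUS status;
//
//     status = alloc_chunk(pmm,
//                          UVM_PMM_GPU_MEMORY_TYPE_USER,
//                          UVM_CHUNK_SIZE_MAX,
//                          UVM_PMM_ALLOC_FLAGS_EVICT,
//                          &chunk);
//     if (status != NV_OK)
//         return status;
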
// Initialize the given root chunk. If the initial state is
// UVM_PMM_GPU_CHUNK_STATE_FREE, the chunk is added to the corresponding free
// list.
//
// PMA lock must be held by the caller
static void init_root_chunk(uvm_pmm_gpu_t *pmm,
                            uvm_pmm_gpu_memory_type_t type,
                            uvm_gpu_root_chunk_t *root_chunk,
                            uvm_pmm_gpu_chunk_state_t initial_state,
                            bool is_zero)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    uvm_gpu_chunk_t *chunk = &root_chunk->chunk;

    uvm_assert_rwsem_locked(&pmm->pma_lock);

    root_chunk_lock(pmm, root_chunk);

    uvm_tracker_init(&root_chunk->tracker);

    uvm_spin_lock(&pmm->list_lock);

    UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED,
                   "Address 0x%llx state %s GPU %s\n",
                   chunk->address,
                   uvm_pmm_gpu_chunk_state_string(chunk->state),
                   uvm_gpu_name(gpu));

    UVM_ASSERT(chunk->parent == NULL);
    UVM_ASSERT(chunk->suballoc == NULL);
    UVM_ASSERT(chunk->va_block == NULL);
    UVM_ASSERT(chunk->va_block_page_index == PAGES_PER_UVM_VA_BLOCK);
    UVM_ASSERT(list_empty(&chunk->list));
    UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
    UVM_ASSERT(!root_chunk_has_elevated_page(pmm, root_chunk));

    UVM_ASSERT(initial_state == UVM_PMM_GPU_CHUNK_STATE_FREE ||
               initial_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);

    chunk->type = type;
    chunk->state = initial_state;
    chunk->is_zero = is_zero;

    chunk_update_lists_locked(pmm, chunk);

    uvm_spin_unlock(&pmm->list_lock);

    root_chunk_unlock(pmm, root_chunk);
}

NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
                           uvm_pmm_gpu_memory_type_t type,
                           uvm_pmm_alloc_flags_t flags,
                           uvm_gpu_chunk_t **out_chunk)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    NV_STATUS status;
    UvmPmaAllocationOptions options = {0};
    NvU32 num_chunks;
    NvU32 i;
    bool used_kmem_cache = false;
    UvmGpuPointer pa;
    UvmGpuPointer *pas;

    // TODO: Bug 2444368: On P9 systems, PMA scrubbing is very slow. For now,
    // zero the chunk within UVM. Re-evaluate this condition once PMA scrubbing
    // is improved.
    //
    // TODO: Bug 2446832: Most (all?) kernel chunks don't require scrubbing.
    // Also, user pages that are about to be overwritten, don't need to be
    // zeroed, either. Add an interface to uvm_pmm_gpu_alloc for callers to
    // specify when they don't need zeroed pages.
    const bool skip_pma_scrubbing = gpu->mem_info.numa.enabled;
    UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(type) || uvm_pmm_gpu_memory_type_is_kernel(type));

    options.flags = UVM_PMA_ALLOCATE_DONT_EVICT;

    if (uvm_pmm_gpu_memory_type_is_kernel(type) || !gpu_supports_pma_eviction(gpu))
        options.flags |= UVM_PMA_ALLOCATE_PINNED;

    if (skip_pma_scrubbing)
        options.flags |= UVM_PMA_ALLOCATE_NO_ZERO;

    // TODO: Bug 200480500: Batching is currently disabled on P9. Re-enable
    // when the performance of best-effort allocations is verified.
    if (gpu->mem_info.numa.enabled)
        flags |= UVM_PMM_ALLOC_FLAGS_DONT_BATCH;

    // When the Confidential Computing feature is enabled, allocate GPU memory
    // in the protected region, unless specified otherwise.
    if (g_uvm_global.conf_computing_enabled && memory_type_is_protected(type))
        options.flags |= UVM_PMA_ALLOCATE_PROTECTED_REGION;

    if (!gpu->parent->rm_info.isSimulated &&
        !(options.flags & UVM_PMA_ALLOCATE_PINNED) &&
        !(flags & UVM_PMM_ALLOC_FLAGS_DONT_BATCH)) {
        num_chunks = 1 << uvm_perf_pma_batch_nonpinned_order;

        // Allocate a batch of root chunks in order to reduce the number of
        // calls to PMA. The first one is returned as allocated, the rest are
        // added to the corresponding free list.
        pas = kmem_cache_alloc(g_pma_address_batch_cache_ref.cache, NV_UVM_GFP_FLAGS);
        if (!pas)
            return NV_ERR_NO_MEMORY;

        // Make the allocation best-effort to avoid retries if the whole batch
        // cannot be allocated.
        options.flags |= UVM_PMA_ALLOCATE_ALLOW_PARTIAL;

        used_kmem_cache = true;
    }
    else {
        num_chunks = 1;

        pas = &pa;
    }

    // Acquire the PMA lock for read so that uvm_pmm_gpu_pma_evict_range() can
    // flush out any pending allocs.
    uvm_down_read(&pmm->pma_lock);

    status = nvUvmInterfacePmaAllocPages(pmm->pma, num_chunks, UVM_CHUNK_SIZE_MAX, &options, pas);
    if (status != NV_OK)
        goto exit_unlock;

    // Batched allocations are best-effort. Therefore, we need to adjust the
    // number of allocated chunks.
    if (used_kmem_cache) {
        UVM_ASSERT(options.numPagesAllocated <= num_chunks);
        UVM_ASSERT(options.numPagesAllocated > 0);
        num_chunks = options.numPagesAllocated;
    }

    for (i = 0; i < num_chunks; ++i) {
        uvm_pmm_gpu_chunk_state_t initial_state;
        uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_address(pmm, pas[i]);
        uvm_gpu_chunk_t *chunk = &root_chunk->chunk;

        if (i == 0) {
            initial_state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
            *out_chunk = chunk;
        }
        else {
            initial_state = UVM_PMM_GPU_CHUNK_STATE_FREE;
        }

        UVM_ASSERT_MSG(IS_ALIGNED(pas[i], UVM_CHUNK_SIZE_MAX), "Address 0x%llx\n", pas[i]);
        UVM_ASSERT(chunk->address == pas[i]);

        init_root_chunk(pmm,
                        type,
                        root_chunk,
                        initial_state,
                        !!(options.resultFlags & UVM_PMA_ALLOCATE_RESULT_IS_ZERO));
    }

exit_unlock:
    uvm_up_read(&pmm->pma_lock);

    if (used_kmem_cache)
        kmem_cache_free(g_pma_address_batch_cache_ref.cache, pas);

    return status;
}

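// Illustrative note on the batching above (the batch order value is
// hypothetical): with uvm_perf_pma_batch_nonpinned_order == 6, a single
// nvUvmInterfacePmaAllocPages() call requests 1 << 6 = 64 root chunks, i.e.
// 64 * 2MB = 128MB. Because UVM_PMA_ALLOCATE_ALLOW_PARTIAL makes the request
// best-effort, anywhere between 1 and 64 chunks may come back in
// options.numPagesAllocated; the first one is returned to the caller and the
// rest go to the free lists via init_root_chunk().
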
void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_root_chunk_mode_t free_mode)
{
    NV_STATUS status;
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
    NvU32 flags = 0;

    // Acquire the PMA lock for read so that uvm_pmm_gpu_pma_evict_range() can
    // flush out any pending frees.
    uvm_down_read(&pmm->pma_lock);

    root_chunk_lock(pmm, root_chunk);

    root_chunk_unmap_indirect_peers(pmm, root_chunk);

    status = uvm_tracker_wait_deinit(&root_chunk->tracker);
    if (status != NV_OK) {
        // TODO: Bug 1766184: Handle RC/ECC. For now just go ahead and free the chunk anyway.
        UVM_ASSERT(uvm_global_get_status() != NV_OK);
    }

    uvm_spin_lock(&pmm->list_lock);

    UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED,
                   "Address 0x%llx state %s GPU %s\n",
                   chunk->address,
                   uvm_pmm_gpu_chunk_state_string(chunk->state),
                   uvm_gpu_name(gpu));
    UVM_ASSERT(list_empty(&chunk->list));

    chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);

    uvm_spin_unlock(&pmm->list_lock);

    root_chunk_unlock(pmm, root_chunk);

    if (free_mode == FREE_ROOT_CHUNK_MODE_SKIP_PMA_FREE) {
        uvm_up_read(&pmm->pma_lock);
        return;
    }

    if (free_mode == FREE_ROOT_CHUNK_MODE_PMA_EVICTION)
        flags |= UVM_PMA_CALLED_FROM_PMA_EVICTION;

    if (chunk->is_zero)
        flags |= UVM_PMA_FREE_IS_ZERO;

    nvUvmInterfacePmaFreePages(pmm->pma, &chunk->address, 1, UVM_CHUNK_SIZE_MAX, flags);

    uvm_up_read(&pmm->pma_lock);
}

// Splits the input chunk into subchunks of the next size down. The chunk state
// can be UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED or
// UVM_PMM_GPU_CHUNK_STATE_ALLOCATED.
//
// UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED: This is a split for allocation.
//
// UVM_PMM_GPU_CHUNK_STATE_ALLOCATED: This is an in-place split. The new chunks
// are also marked allocated and they inherit the reverse map from the original.
//
// The PMM lock must be held when calling this function.
NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    uvm_chunk_size_t chunk_size = uvm_gpu_chunk_get_size(chunk);
    uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[chunk->type];
    uvm_chunk_size_t subchunk_size;
    size_t cache_idx, num_sub;
    int i;
    NV_STATUS status;
    uvm_pmm_gpu_chunk_suballoc_t *suballoc;
    uvm_gpu_chunk_t *subchunk;
    uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);

    uvm_assert_mutex_locked(&pmm->lock);
    UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
               chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);

    subchunk_size = uvm_chunk_find_prev_size(chunk_sizes, chunk_size);
    UVM_ASSERT(subchunk_size != UVM_CHUNK_SIZE_INVALID);

    num_sub = chunk_size / subchunk_size;
    cache_idx = ilog2(num_sub);
    UVM_ASSERT(chunk_split_cache[cache_idx].cache != NULL);

    suballoc = nv_kmem_cache_zalloc(chunk_split_cache[cache_idx].cache, NV_UVM_GFP_FLAGS);
    if (suballoc == NULL)
        return NV_ERR_NO_MEMORY;

    for (i = 0; i < num_sub; i++) {
        // If requested, inject a failure on the last subchunk
        if (unlikely(chunk->inject_split_error) && i == num_sub - 1) {
            status = NV_ERR_NO_MEMORY;
            goto cleanup;
        }

        subchunk = nv_kmem_cache_zalloc(CHUNK_CACHE, NV_UVM_GFP_FLAGS);
        if (!subchunk) {
            status = NV_ERR_NO_MEMORY;
            goto cleanup;
        }
        suballoc->subchunks[i] = subchunk;

        subchunk->gpu_index = chunk->gpu_index;
        subchunk->address = chunk->address + i * subchunk_size;
        subchunk->type = chunk->type;
        uvm_gpu_chunk_set_size(subchunk, subchunk_size);
        subchunk->parent = chunk;
        subchunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
        subchunk->is_zero = chunk->is_zero;
        INIT_LIST_HEAD(&subchunk->list);

        // The child inherits the parent's state.
        subchunk->state = chunk->state;

        if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
            UVM_ASSERT(chunk->va_block);
            uvm_assert_mutex_locked(&chunk->va_block->lock);
            subchunk->va_block = chunk->va_block;
            subchunk->va_block_page_index = chunk->va_block_page_index + (i * subchunk_size) / PAGE_SIZE;
            subchunk->is_referenced = chunk->is_referenced;
        }
    }

    // We're splitting an allocated or pinned chunk in-place.
    suballoc->allocated = num_sub;

    // Now that all of the subchunk state has been initialized, transition the
    // parent into the split state under the list lock.
    uvm_spin_lock(&pmm->list_lock);

    chunk->suballoc = suballoc;

    if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
        chunk->va_block = NULL;
        chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
        chunk->is_referenced = false;
    }
    else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
        // -1 for the parent chunk that is going to transition into the split state.
        root_chunk->chunk.suballoc->pinned_leaf_chunks += num_sub - 1;

        // When a pinned root chunk gets split, the count starts at 0 not
        // accounting for the root chunk itself so add the 1 back.
        if (chunk_is_root_chunk(chunk))
            root_chunk->chunk.suballoc->pinned_leaf_chunks += 1;
    }

    chunk->state = UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT;

    uvm_spin_unlock(&pmm->list_lock);

    return NV_OK;
cleanup:
    for (i = 0; i < num_sub; i++) {
        if (suballoc->subchunks[i] == NULL)
            break;
        kmem_cache_free(CHUNK_CACHE, suballoc->subchunks[i]);
    }
    kmem_cache_free(chunk_split_cache[cache_idx].cache, suballoc);
    return status;
}

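// Illustrative example of the split arithmetic above (the sizes are
// hypothetical, assuming a 64KB subchunk size and 4KB pages): splitting a 2MB
// (UVM_CHUNK_SIZE_MAX) chunk yields num_sub = 2M / 64K = 32 subchunks;
// subchunk i starts at chunk->address + i * 64K, and for an in-place split of
// an allocated chunk its va_block_page_index is offset from the parent's by
// (i * 64K) / 4K = i * 16 pages.
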
// Sanity check the chunk, the chunk's tree, and any mappings to the chunk. The
// chunk must be newly-freed or newly-allocated, but its state may not reflect
// that yet.
//
// This function always returns true so it can be called from an assert macro.
static bool check_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
    uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[chunk->type];
    uvm_gpu_chunk_t *parent = chunk->parent;
    uvm_chunk_size_t chunk_size = uvm_gpu_chunk_get_size(chunk);
    uvm_chunk_size_t parent_size;

    UVM_ASSERT(chunk_size & chunk_sizes);
    UVM_ASSERT(IS_ALIGNED(chunk->address, chunk_size));
    UVM_ASSERT(uvm_id_equal(uvm_gpu_id_from_index(chunk->gpu_index), gpu->id));

    // See pmm_squash_memory_type().
    if (!g_uvm_global.conf_computing_enabled)
        UVM_ASSERT((chunk->type == UVM_PMM_GPU_MEMORY_TYPE_USER) || (chunk->type == UVM_PMM_GPU_MEMORY_TYPE_KERNEL));

    if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
        UVM_ASSERT(chunk_size > uvm_chunk_find_first_size(chunk_sizes));

    if (parent) {
        UVM_ASSERT(parent->type == chunk->type);

        parent_size = uvm_gpu_chunk_get_size(parent);
        UVM_ASSERT(uvm_chunk_find_next_size(chunk_sizes, chunk_size) == parent_size);
        UVM_ASSERT(parent_size <= uvm_chunk_find_last_size(chunk_sizes));

        UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
        UVM_ASSERT(parent->suballoc);
        UVM_ASSERT(parent->suballoc->allocated > 0);
        UVM_ASSERT(parent->suballoc->allocated <= num_subchunks(parent));

        UVM_ASSERT(parent->address <= chunk->address);
        UVM_ASSERT(chunk->address < parent->address + parent_size);
    }
    else {
        UVM_ASSERT(chunk_size == uvm_chunk_find_last_size(chunk_sizes));
    }

    if (uvm_pmm_sysmem_mappings_indirect_supported()) {
        uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
        uvm_gpu_id_t other_gpu_id;

        root_chunk_lock(pmm, root_chunk);

        // See root_chunk_unmap_indirect_peers for the usage of uvm_gpu_get
        for_each_gpu_id_in_mask(other_gpu_id, &root_chunk->indirect_peers_mapped) {
            uvm_gpu_t *other_gpu = uvm_gpu_get(other_gpu_id);
            NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, other_gpu);
            uvm_reverse_map_t reverse_map;
            size_t num_mappings;

            num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&other_gpu->pmm_reverse_sysmem_mappings,
                                                               peer_addr,
                                                               uvm_gpu_chunk_get_size(chunk),
                                                               &reverse_map,
                                                               1);
            UVM_ASSERT(num_mappings == 0);
        }

        root_chunk_unlock(pmm, root_chunk);
    }

    return true;
}

static bool chunk_is_last_allocated_child(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
    uvm_assert_spinlock_locked(&pmm->list_lock);

    if (!chunk->parent)
        return false;

    return chunk->parent->suballoc->allocated == 1;
}

chunk_free_locked(uvm_pmm_gpu_t * pmm,uvm_gpu_chunk_t * chunk)2504 static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2505 {
2506     uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
2507 
2508     uvm_assert_spinlock_locked(&pmm->list_lock);
2509 
2510     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
2511                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2512 
2513     if (root_chunk->chunk.in_eviction) {
2514         // A root chunk with pinned subchunks would never be picked for eviction
2515         // so this one has to be in the allocated state. Pin it and let the
2516         // evicting thread pick it up.
2517         UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
2518         UVM_ASSERT(chunk->va_block != NULL);
2519         UVM_ASSERT(chunk->va_block_page_index != PAGES_PER_UVM_VA_BLOCK);
2520         UVM_ASSERT(list_empty(&chunk->list));
2521         chunk->va_block = NULL;
2522         chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
2523         chunk->is_zero = false;
2524         chunk_pin(pmm, chunk);
2525         return;
2526     }
2527 
2528     if (chunk->parent) {
2529         UVM_ASSERT(chunk->parent->suballoc->allocated > 0);
2530         --chunk->parent->suballoc->allocated;
2531         if (chunk->parent->suballoc->allocated == 0) {
2532             // Freeing the last subchunk should trigger a merge and the PMM
2533             // mutex is required to perform it.
2534             uvm_assert_mutex_locked(&pmm->lock);
2535         }
2536     }
2537 
2538     if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
2539         chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_FREE);
2540     }
2541     else {
2542         chunk->state = UVM_PMM_GPU_CHUNK_STATE_FREE;
2543         chunk->va_block = NULL;
2544     }
2545 
2546     chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
2547     chunk->is_zero = false;
2548 
2549     chunk_update_lists_locked(pmm, chunk);
2550 }
2551 
2552 static bool try_chunk_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2553 {
2554     bool freed = false;
2555 
2556     uvm_spin_lock(&pmm->list_lock);
2557 
2558     UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED || !chunk->is_referenced);
2559 
2560     chunk->inject_split_error = false;
2561 
2562     // Chunks that are the last allocated child need to trigger a merge and are
2563     // handled by free_or_prepare_for_merge().
2564     if (!chunk_is_last_allocated_child(pmm, chunk)) {
2565         chunk_free_locked(pmm, chunk);
2566         freed = true;
2567     }
2568 
2569     uvm_spin_unlock(&pmm->list_lock);
2570 
2571     return freed;
2572 }
2573 
2574 // Return NULL if the chunk could be freed immediately. Otherwise, if the chunk
2575 // was the last allocated child, return the parent chunk to be merged with all
2576 // of its children taken off the free list in TEMP_PINNED state.
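//
// For example (illustrative chunk sizes): if a 2M root chunk is split into 64K
// subchunks and the caller frees the only subchunk still allocated, this returns
// the 2M parent with all of its subchunks pinned, so that the loop in
// free_chunk_with_merges() can merge them back into a single chunk and then free
// that chunk on its next iteration.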
2577 static uvm_gpu_chunk_t *free_or_prepare_for_merge(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2578 {
2579     uvm_gpu_chunk_t *parent = NULL;
2580     NvU32 i;
2581 
2582     uvm_assert_mutex_locked(&pmm->lock);
2583 
2584     if (!chunk->parent) {
2585         bool freed = try_chunk_free(pmm, chunk);
2586 
2587         // Freeing a root chunk should never fail
2588         UVM_ASSERT(freed);
2589 
2590         return NULL;
2591     }
2592 
2593     uvm_spin_lock(&pmm->list_lock);
2594 
2595     if (chunk_is_last_allocated_child(pmm, chunk))
2596         parent = chunk->parent;
2597 
2598     chunk_free_locked(pmm, chunk);
2599 
2600     if (parent == NULL) {
2601         UVM_ASSERT(chunk->parent->suballoc->allocated != 0);
2602         goto done;
2603     }
2604 
2605     UVM_ASSERT(chunk->parent->suballoc->allocated == 0);
2606 
2607     // Pin all the subchunks to prepare them for being merged.
2608     for (i = 0; i < num_subchunks(chunk->parent); ++i) {
2609         uvm_gpu_chunk_t *subchunk = chunk->parent->suballoc->subchunks[i];
2610 
2611         UVM_ASSERT(subchunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
2612 
2613         list_del_init(&subchunk->list);
2614         subchunk->state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
2615     }
2616     root_chunk_from_chunk(pmm, chunk)->chunk.suballoc->pinned_leaf_chunks += num_subchunks(chunk->parent);
2617 
2618     chunk->parent->suballoc->allocated = num_subchunks(chunk->parent);
2619     parent = chunk->parent;
2620 
2621 done:
2622     uvm_spin_unlock(&pmm->list_lock);
2623 
2624     return parent;
2625 }
2626 
2627 static void free_chunk_with_merges(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2628 {
2629     uvm_assert_mutex_locked(&pmm->lock);
2630 
2631     while (1) {
2632         // When called from the free_chunk path this check_chunk is redundant,
2633         // but we have some PMM-internal direct calls of this function.
2634         UVM_ASSERT(check_chunk(pmm, chunk));
2635 
2636         chunk = free_or_prepare_for_merge(pmm, chunk);
2637         if (!chunk)
2638             break;
2639 
2640         merge_gpu_chunk(pmm, chunk);
2641     }
2642 }
2643 
2644 // Mark the chunk as free and put it on the free list. If this is a suballocated
2645 // chunk and the parent has no more allocated chunks, the parent is freed and so
2646 // on up the tree.
2647 static void free_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2648 {
2649     bool try_free = true;
2650     const bool is_root = chunk_is_root_chunk(chunk);
2651     const uvm_pmm_gpu_memory_type_t type = chunk->type;
2652 
2653     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
2654                chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2655 
2656     UVM_ASSERT(check_chunk(pmm, chunk));
2657 
2658     if (try_chunk_free(pmm, chunk)) {
2659         try_free = is_root;
2660     }
2661     else {
2662         // Freeing a chunk can only fail if it requires merging. Take the PMM lock
2663         // and free it with merges supported.
2664         uvm_mutex_lock(&pmm->lock);
2665         free_chunk_with_merges(pmm, chunk);
2666         uvm_mutex_unlock(&pmm->lock);
2667     }
2668 
2669     // Once try_chunk_free succeeds or free_chunk_with_merges returns, it's no
2670     // longer safe to access chunk in general. All you know is that the
2671     // chunk you freed was put on the free list by the call. Since the spin lock
2672     // has been dropped, any other thread could have come in and allocated the
2673     // chunk in the meantime. Therefore, this next step just looks for a
2674     // root chunk to free, without assuming that one is actually there.
2675 
2676     if (try_free)
2677         (void)free_next_available_root_chunk(pmm, type);
2678 }
2679 
2680 // Finds and frees the next root chunk of the given type (if any) that can be
2681 // freed. Returns true if a root chunk was freed, or false otherwise.
2682 bool free_next_available_root_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_type_t type)
2683 {
2684     uvm_gpu_chunk_t *result;
2685 
2686     UVM_ASSERT(uvm_chunk_find_last_size(pmm->chunk_sizes[type]) == UVM_CHUNK_SIZE_MAX);
2687 
2688     uvm_spin_lock(&pmm->list_lock);
2689 
2690     // Prefer non-zero free chunk as memory is about to be released to PMA
2691     result = list_first_chunk(find_free_list(pmm, type, UVM_CHUNK_SIZE_MAX, UVM_PMM_LIST_NO_ZERO));
2692     if (result)
2693         UVM_ASSERT(!result->is_zero);
2694 
2695     if (!result) {
2696         result = list_first_chunk(find_free_list(pmm, type, UVM_CHUNK_SIZE_MAX, UVM_PMM_LIST_ZERO));
2697         if (result)
2698             UVM_ASSERT(result->is_zero);
2699     }
2700 
2701     if (result != NULL) {
2702         list_del_init(&result->list);
2703         UVM_ASSERT(result->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
2704         UVM_ASSERT(uvm_gpu_chunk_get_size(result) == UVM_CHUNK_SIZE_MAX);
2705         UVM_ASSERT(result->type == type);
2706 
2707         // The chunk has been freed and removed from the free list so it
2708         // can't get allocated again, but it could be targeted for eviction
2709         // by physical address. Pin it temporarily to protect the chunk from
2710         // eviction between dropping the list lock and taking the root chunk
2711         // lock.
2712         chunk_pin(pmm, result);
2713     }
2714 
2715     uvm_spin_unlock(&pmm->list_lock);
2716 
2717     if (result != NULL) {
2718         free_root_chunk(pmm, root_chunk_from_chunk(pmm, result), FREE_ROOT_CHUNK_MODE_DEFAULT);
2719         return true;
2720     }
2721 
2722     return false;
2723 }
2724 
2725 // Get free list for the given chunk size and type
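//
// The index computed below is simply the number of supported chunk sizes smaller
// than the requested one: hweight_long(chunk_sizes & (chunk_size - 1)). For
// example (illustrative mask), with chunk_sizes == 0x201000 (4K | 2M) a 4K
// request masks to 0 and lands at index 0, while a 2M request masks to 0x1000,
// whose population count is 1, landing at index 1.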
2726 struct list_head *find_free_list(uvm_pmm_gpu_t *pmm,
2727                                  uvm_pmm_gpu_memory_type_t type,
2728                                  uvm_chunk_size_t chunk_size,
2729                                  uvm_pmm_list_zero_t zero_type)
2730 {
2731     uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[type];
2732     size_t idx = hweight_long(chunk_sizes & (chunk_size - 1));
2733     UVM_ASSERT(is_power_of_2(chunk_size));
2734     UVM_ASSERT_MSG(chunk_size & chunk_sizes, "chunk size 0x%x chunk sizes 0x%x\n", chunk_size, chunk_sizes);
2735     return &pmm->free_list[type][idx][zero_type];
2736 }
2737 
2738 struct list_head *find_free_list_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2739 {
2740     return find_free_list(pmm,
2741                           chunk->type,
2742                           uvm_gpu_chunk_get_size(chunk),
2743                           chunk->is_zero ? UVM_PMM_LIST_ZERO : UVM_PMM_LIST_NO_ZERO);
2744 }
2745 
2746 static bool uvm_pmm_should_inject_pma_eviction_error(uvm_pmm_gpu_t *pmm)
2747 {
2748     uvm_assert_mutex_locked(&pmm->lock);
2749 
2750     if (unlikely(pmm->inject_pma_evict_error_after_num_chunks > 0))
2751         return --pmm->inject_pma_evict_error_after_num_chunks == 0;
2752 
2753     return false;
2754 }
2755 
2756 // See the documentation of pmaEvictPagesCb_t in pma.h for details of the
2757 // expected semantics.
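//
// Roughly: PMA asks for num_pages_to_evict pages of page_size falling within the
// given physical range. The request is satisfied by evicting whole root chunks
// and handing their addresses back in page_size slices; any unused tail of the
// last evicted chunk is returned directly to PMA. For example (illustrative page
// size), with a 64K page_size each evicted 2M root chunk supplies up to
// 2M / 64K = 32 pages.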
2758 static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
2759                                              NvU32 page_size,
2760                                              NvU64 *pages,
2761                                              NvU32 num_pages_to_evict,
2762                                              NvU64 phys_start,
2763                                              NvU64 phys_end,
2764                                              UVM_PMA_GPU_MEMORY_TYPE mem_type)
2765 {
2766     NV_STATUS status;
2767     uvm_pmm_gpu_t *pmm = (uvm_pmm_gpu_t *)void_pmm;
2768     uvm_gpu_chunk_t *chunk;
2769     NvU64 num_pages_evicted_so_far = 0;
2770     NvU64 num_pages_left_to_evict = num_pages_to_evict;
2771     const NvU64 pages_per_chunk = UVM_CHUNK_SIZE_MAX / page_size;
2772     bool all_pages_are_zero = true;
2773 
2774     UVM_ASSERT(IS_ALIGNED(UVM_CHUNK_SIZE_MAX, page_size));
2775     UVM_ASSERT(UVM_CHUNK_SIZE_MAX >= page_size);
2776 
2777     // Currently, when the Confidential Computing feature is enabled, the
2778     // entirety of vidmem is protected.
2779     if (g_uvm_global.conf_computing_enabled && (mem_type != UVM_PMA_GPU_MEMORY_TYPE_PROTECTED))
2780         return NV_ERR_INVALID_ARGUMENT;
2781 
2782     while (num_pages_left_to_evict > 0) {
2783         uvm_gpu_root_chunk_t *root_chunk;
2784         uvm_page_index_t page_index;
2785         NvU64 pages_this_time = min(pages_per_chunk, num_pages_left_to_evict);
2786 
2787         uvm_mutex_lock(&pmm->lock);
2788 
2789         if (uvm_pmm_should_inject_pma_eviction_error(pmm)) {
2790             status = NV_ERR_NO_MEMORY;
2791         }
2792         else {
2793             status = pick_and_evict_root_chunk_retry(pmm,
2794                                                      UVM_PMM_GPU_MEMORY_TYPE_KERNEL,
2795                                                      PMM_CONTEXT_PMA_EVICTION,
2796                                                      &chunk);
2797         }
2798         uvm_mutex_unlock(&pmm->lock);
2799 
2800         // TODO: Bug 1795559: Consider waiting for any pinned user allocations
2801         // to be unpinned.
2802         if (status != NV_OK)
2803             goto error;
2804 
2805         root_chunk = root_chunk_from_chunk(pmm, chunk);
2806 
2807         if (chunk->address < phys_start || chunk->address + UVM_CHUNK_SIZE_MAX > phys_end) {
2808             // If the chunk we get is outside of the physical range requested,
2809             // just give up and return an error.
2810             //
2811             // TODO: Bug 1795559: PMA pre-populates the array of pages with a
2812             // list of candidates that were unpinned before triggering eviction.
2813             // If they were marked for eviction, we could fall back to evicting
2814             // those instead and be sure that it succeeds.
2815             free_root_chunk(pmm, root_chunk, FREE_ROOT_CHUNK_MODE_PMA_EVICTION);
2816             status = NV_ERR_NO_MEMORY;
2817             goto error;
2818         }
2819 
2820         all_pages_are_zero = all_pages_are_zero && chunk->is_zero;
2821 
2822         // Free the root chunk as far as PMM's state is concerned, but skip the
2823         // free back to PMA as that would make it available for other PMA
2824         // allocations.
2825         free_root_chunk(pmm, root_chunk, FREE_ROOT_CHUNK_MODE_SKIP_PMA_FREE);
2826 
2827         for (page_index = 0; page_index < pages_this_time; page_index++)
2828             pages[num_pages_evicted_so_far++] = chunk->address + page_index * page_size;
2829 
2830         num_pages_left_to_evict -= pages_this_time;
2831 
2832         // If we didn't use a whole root chunk, free its tail back to PMA
2833         // directly.
2834         if (pages_this_time != pages_per_chunk) {
2835             NvU64 address = chunk->address + pages_this_time * page_size;
2836             NvU64 num_pages = pages_per_chunk - pages_this_time;
2837             NvU32 free_flags = UVM_PMA_CALLED_FROM_PMA_EVICTION | UVM_PMA_ALLOCATE_CONTIGUOUS;
2838 
2839             if (chunk->is_zero)
2840                 free_flags |= UVM_PMA_FREE_IS_ZERO;
2841 
2842             // Free the whole tail as a contiguous allocation
2843             nvUvmInterfacePmaFreePages(pmm->pma, &address, num_pages, page_size, free_flags);
2844         }
2845     }
2846 
2847     return NV_OK;
2848 
2849 error:
2850     // On error, free all of the evicted pages back to PMA directly.
2851     if (num_pages_evicted_so_far > 0) {
2852         NvU32 free_flags = UVM_PMA_CALLED_FROM_PMA_EVICTION;
2853 
2854         if (all_pages_are_zero)
2855             free_flags |= UVM_PMA_FREE_IS_ZERO;
2856 
2857         nvUvmInterfacePmaFreePages(pmm->pma, pages, num_pages_evicted_so_far, page_size, free_flags);
2858     }
2859 
2860     return status;
2861 }
2862 
2863 static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
2864                                                      NvU32 page_size,
2865                                                      NvU64 *pages,
2866                                                      NvU32 num_pages_to_evict,
2867                                                      NvU64 phys_start,
2868                                                      NvU64 phys_end,
2869                                                      UVM_PMA_GPU_MEMORY_TYPE mem_type)
2870 {
2871     NV_STATUS status;
2872 
2873     // RM invokes the eviction callbacks with its API lock held, but not its GPU
2874     // lock.
2875     uvm_record_lock_rm_api();
2876     status = uvm_pmm_gpu_pma_evict_pages(void_pmm, page_size, pages, num_pages_to_evict, phys_start, phys_end, mem_type);
2877     uvm_record_unlock_rm_api();
2878     return status;
2879 }
2880 
2881 static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper_entry(void *void_pmm,
2882                                                            NvU64 page_size,
2883                                                            NvU64 *pages,
2884                                                            NvU32 num_pages_to_evict,
2885                                                            NvU64 phys_start,
2886                                                            NvU64 phys_end,
2887                                                            UVM_PMA_GPU_MEMORY_TYPE mem_type)
2888 {
2889     UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_pages_wrapper(void_pmm,
2890                                                       page_size,
2891                                                       pages,
2892                                                       num_pages_to_evict,
2893                                                       phys_start,
2894                                                       phys_end,
2895                                                       mem_type));
2896 }
2897 
2898 // See the documentation of pmaEvictRangeCb_t in pma.h for details of the
2899 // expected semantics.
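//
// Roughly: every root chunk overlapping the requested physical range is evicted
// and then returned to PMA. The start is aligned down to UVM_CHUNK_SIZE_MAX, so
// a request covering, say, [3M, 5M] (illustrative addresses) visits the root
// chunks at 2M and 4M.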
2900 static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm,
2901                                              NvU64 phys_begin,
2902                                              NvU64 phys_end,
2903                                              UVM_PMA_GPU_MEMORY_TYPE mem_type)
2904 {
2905     NV_STATUS status;
2906     uvm_pmm_gpu_t *pmm = (uvm_pmm_gpu_t *)void_pmm;
2907     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
2908     NvU64 address = UVM_ALIGN_DOWN(phys_begin, UVM_CHUNK_SIZE_MAX);
2909 
2910     UVM_ASSERT_MSG(phys_begin <= phys_end, "range [0x%llx, 0x%llx]\n", phys_begin, phys_end);
2911     UVM_ASSERT_MSG(phys_end <= gpu->mem_info.max_allocatable_address,
2912                    "range [0x%llx, 0x%llx]\n",
2913                    phys_begin,
2914                    phys_end);
2915 
2916     // Make sure that all pending allocations that could have started before
2917     // the eviction callback was called are done. This is required to guarantee
2918     // that any address that PMA thinks is owned by UVM has indeed been recorded
2919     // in PMM's state. Taking the pma_lock in write mode will make sure all
2920     // readers (pending allocations and frees) are done, but will also
2921     // unnecessarily stop new allocations from starting until it's released.
2922     // TODO: Bug 1795559: SRCU would likely be better for this type of
2923     // synchronization, but that's GPL. Figure out whether we can do anything
2924     // better easily.
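    //
    // In other words, the write acquisition below acts purely as a barrier: it
    // can only succeed once every thread holding pma_lock in read mode (an
    // in-flight allocation or free) has dropped it, and it is released right
    // away without doing any work under it.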
2925     uvm_down_write(&pmm->pma_lock);
2926     uvm_up_write(&pmm->pma_lock);
2927 
2928     for (; address <= phys_end; address += UVM_CHUNK_SIZE_MAX) {
2929         uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_address(pmm, address);
2930         uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
2931         bool eviction_started = false;
2932         uvm_spin_loop_t spin;
2933         bool should_inject_error;
2934 
2935         uvm_spin_loop_init(&spin);
2936 
2937         // Wait until we can start eviction or the chunk is returned to PMA
2938         do {
2939             uvm_spin_lock(&pmm->list_lock);
2940 
2941             if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
2942                 UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
2943 
2944                 if (chunk_is_evictable(pmm, chunk)) {
2945                     chunk_start_eviction(pmm, chunk);
2946                     eviction_started = true;
2947                 }
2948             }
2949 
2950             uvm_spin_unlock(&pmm->list_lock);
2951 
2952             // TODO: Bug 1795559: Replace this with a wait queue.
2953             if (UVM_SPIN_LOOP(&spin) == NV_ERR_TIMEOUT_RETRY) {
2954                 UVM_ERR_PRINT("Stuck waiting for root chunk 0x%llx to be unpinned, giving up\n", chunk->address);
2955                 return NV_ERR_NO_MEMORY;
2956             }
2957         } while (!eviction_started && chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
2958 
2959         // The eviction callback gets called with a physical range that might be
2960         // only partially allocated by UVM. Skip the chunks that UVM doesn't own.
2961         if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
2962             continue;
2963 
2964         uvm_mutex_lock(&pmm->lock);
2965 
2966         status = evict_root_chunk(pmm, root_chunk, PMM_CONTEXT_PMA_EVICTION);
2967         should_inject_error = uvm_pmm_should_inject_pma_eviction_error(pmm);
2968 
2969         uvm_mutex_unlock(&pmm->lock);
2970 
2971         if (status != NV_OK)
2972             return status;
2973 
2974         free_root_chunk(pmm, root_chunk, FREE_ROOT_CHUNK_MODE_PMA_EVICTION);
2975 
2976         if (should_inject_error)
2977             return NV_ERR_NO_MEMORY;
2978     }
2979 
2980     // Make sure that all pending frees for chunks that the eviction above could
2981     // have observed as PMA owned are done. This is required to guarantee that
2982     // any address that PMM thinks is owned by PMA has actually been freed
2983     // back to PMA. Taking the pma_lock in write mode will make sure all
2984     // readers (pending frees) are done, but will also unnecessarily stop new
2985     // allocations and frees from starting until it's released.
2986     uvm_down_write(&pmm->pma_lock);
2987     uvm_up_write(&pmm->pma_lock);
2988 
2989     return NV_OK;
2990 }
2991 
2992 static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper(void *void_pmm,
2993                                                      NvU64 phys_begin,
2994                                                      NvU64 phys_end,
2995                                                      UVM_PMA_GPU_MEMORY_TYPE mem_type)
2996 {
2997     NV_STATUS status;
2998 
2999     // RM invokes the eviction callbacks with its API lock held, but not its GPU
3000     // lock.
3001     uvm_record_lock_rm_api();
3002     status = uvm_pmm_gpu_pma_evict_range(void_pmm, phys_begin, phys_end, mem_type);
3003     uvm_record_unlock_rm_api();
3004     return status;
3005 }
3006 
3007 static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper_entry(void *void_pmm,
3008                                                            NvU64 phys_begin,
3009                                                            NvU64 phys_end,
3010                                                            UVM_PMA_GPU_MEMORY_TYPE mem_type)
3011 {
3012     UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_range_wrapper(void_pmm, phys_begin, phys_end, mem_type));
3013 }
3014 
3015 static void deinit_chunk_split_cache(uvm_pmm_gpu_t *pmm)
3016 {
3017     unsigned long subchunk_count_log2;
3018 
3019     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3020 
3021     for_each_set_bit(subchunk_count_log2, pmm->chunk_split_cache_initialized, UVM_PMM_CHUNK_SPLIT_CACHE_SIZES) {
3022         UVM_ASSERT(chunk_split_cache[subchunk_count_log2].refcount > 0);
3023         UVM_ASSERT(chunk_split_cache[subchunk_count_log2].cache);
3024 
3025         if (--chunk_split_cache[subchunk_count_log2].refcount == 0)
3026             kmem_cache_destroy_safe(&chunk_split_cache[subchunk_count_log2].cache);
3027 
3028         __clear_bit(subchunk_count_log2, pmm->chunk_split_cache_initialized);
3029     }
3030 }
3031 
3032 static NV_STATUS init_chunk_split_cache_level(uvm_pmm_gpu_t *pmm, size_t level)
3033 {
3034     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3035 
3036     if (!test_bit(level, pmm->chunk_split_cache_initialized)) {
3037         if (!chunk_split_cache[level].cache) {
3038             size_t size;
3039             size_t align;
3040             if (level == 0) {
3041                 strncpy(chunk_split_cache[level].name, "uvm_gpu_chunk_t", sizeof(chunk_split_cache[level].name) - 1);
3042                 size = sizeof(uvm_gpu_chunk_t);
3043                 align = __alignof__(uvm_gpu_chunk_t);
3044             } else {
3045                 snprintf(chunk_split_cache[level].name,
3046                          sizeof(chunk_split_cache[level].name),
3047                          "uvm_gpu_chunk_%u", (unsigned)level);
3048                 size = sizeof(uvm_pmm_gpu_chunk_suballoc_t) + (sizeof(uvm_gpu_chunk_t *) << level);
3049                 align = __alignof__(uvm_pmm_gpu_chunk_suballoc_t);
3050             }
3051             chunk_split_cache[level].cache =
3052                 nv_kmem_cache_create(chunk_split_cache[level].name, size, align);
3053 
3054 
3055             if (!chunk_split_cache[level].cache)
3056                 return NV_ERR_NO_MEMORY;
3057 
3058             UVM_ASSERT(chunk_split_cache[level].refcount == 0);
3059         } else {
3060             UVM_ASSERT(chunk_split_cache[level].refcount > 0);
3061         }
3062 
3063         ++chunk_split_cache[level].refcount;
3064         UVM_ASSERT_MSG(chunk_split_cache[level].refcount != 0, "Overflow of refcount\n");
3065 
3066         __set_bit(level, pmm->chunk_split_cache_initialized);
3067     }
3068 
3069     return NV_OK;
3070 }
3071 
3072 // Initializes the split cache for given GPU.
3073 //
3074 // It walks through all memory splits - in other words all ratios of neighboring
3075 // pairs of sizes - and allocates kmem cache for them, unless they are already
3076 // allocated.
3077 //
3078 // It also bumps the refcount if this GPU did not use such split yet.
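//
// For example (illustrative sizes): with chunk sizes {4K, 64K, 2M} the
// neighboring ratios are 64K / 4K = 16 and 2M / 64K = 32, so caches for levels
// ilog2(16) = 4 and ilog2(32) = 5 are created, plus level 0, which is sized for
// plain uvm_gpu_chunk_t objects.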
3079 static NV_STATUS init_chunk_split_cache(uvm_pmm_gpu_t *pmm)
3080 {
3081     NV_STATUS status;
3082     uvm_pmm_gpu_memory_type_t type;
3083 
3084     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3085 
3086     for (type = 0; type < UVM_PMM_GPU_MEMORY_TYPE_COUNT; type++) {
3087         uvm_chunk_size_t prev_size, cur_size;
3088         uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[type];
3089         // Iterate over each pair of neighboring sizes. Note that the same
3090         // level may be visited multiple times; init_chunk_split_cache_level
3091         // handles that internally.
3092         prev_size = uvm_chunk_find_first_size(chunk_sizes);
3093         cur_size = uvm_chunk_find_next_size(chunk_sizes, prev_size);
3094         for_each_chunk_size_from(cur_size, chunk_sizes) {
3095             size_t subchunk_count = cur_size / prev_size;
3096             size_t level = ilog2(subchunk_count);
3097             status = init_chunk_split_cache_level(pmm, level);
3098             if (status != NV_OK)
3099                 return status;
3100 
3101             prev_size = cur_size;
3102         }
3103     }
3104 
3105     return init_chunk_split_cache_level(pmm, 0);
3106 }
3107 
3108 static NV_STATUS init_pma_address_batch_cache(uvm_pmm_gpu_t *pmm)
3109 {
3110     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3111 
3112     if (!g_pma_address_batch_cache_ref.cache) {
3113         const size_t address_batch_size = sizeof(UvmGpuPointer) << uvm_perf_pma_batch_nonpinned_order;
3114 
3115         snprintf(g_pma_address_batch_cache_ref.name,
3116                  sizeof(g_pma_address_batch_cache_ref.name),
3117                  "pma_address_batch");
3118         g_pma_address_batch_cache_ref.cache =
3119             nv_kmem_cache_create(g_pma_address_batch_cache_ref.name,
3120                               address_batch_size, __alignof__(UvmGpuPointer));
3121 
3122         if (!g_pma_address_batch_cache_ref.cache)
3123             return NV_ERR_NO_MEMORY;
3124 
3125         UVM_ASSERT(g_pma_address_batch_cache_ref.refcount == 0);
3126     }
3127     else {
3128         UVM_ASSERT(g_pma_address_batch_cache_ref.refcount > 0);
3129     }
3130 
3131     pmm->pma_address_cache_initialized = true;
3132 
3133     ++g_pma_address_batch_cache_ref.refcount;
3134     UVM_ASSERT_MSG(g_pma_address_batch_cache_ref.refcount != 0, "Overflow of refcount\n");
3135 
3136     return NV_OK;
3137 }
3138 
3139 static void deinit_pma_address_batch_cache(uvm_pmm_gpu_t *pmm)
3140 {
3141     if (pmm->pma_address_cache_initialized) {
3142         UVM_ASSERT(g_pma_address_batch_cache_ref.refcount > 0);
3143         UVM_ASSERT(g_pma_address_batch_cache_ref.cache);
3144 
3145         if (--g_pma_address_batch_cache_ref.refcount == 0)
3146             kmem_cache_destroy_safe(&g_pma_address_batch_cache_ref.cache);
3147 
3148         pmm->pma_address_cache_initialized = false;
3149     }
3150 }
3151 
3152 static void deinit_caches(uvm_pmm_gpu_t *pmm)
3153 {
3154     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3155 
3156     deinit_pma_address_batch_cache(pmm);
3157     deinit_chunk_split_cache(pmm);
3158 }
3159 
3160 static NV_STATUS init_caches(uvm_pmm_gpu_t *pmm)
3161 {
3162     NV_STATUS status;
3163 
3164     status = init_pma_address_batch_cache(pmm);
3165     if (status != NV_OK)
3166         goto cleanup;
3167 
3168     status = init_chunk_split_cache(pmm);
3169     if (status != NV_OK)
3170         goto cleanup;
3171 
3172     return NV_OK;
3173 
3174 cleanup:
3175     deinit_caches(pmm);
3176 
3177     return status;
3178 }
3179 
3180 typedef struct
3181 {
3182     // Start/end of the physical region to be traversed (IN)
3183     NvU64 phys_start;
3184     NvU64 phys_end;
3185 
3186     // Pointer to the array of mappings in which to store the results (OUT)
3187     uvm_reverse_map_t *mappings;
3188 
3189     // Number of entries written to mappings (OUT)
3190     NvU32 num_mappings;
3191 } get_chunk_mappings_data_t;
3192 
3193 // Chunk traversal function used for phys-to-virt translation. These are the
3194 // possible return values.
3195 //
3196 // - NV_ERR_OUT_OF_RANGE: no allocated physical chunks were found
3197 // - NV_ERR_MORE_DATA_AVAILABLE: allocated physical chunks were found
3198 // - NV_OK: allocated physical chunks may have been found. Check num_mappings
3199 static NV_STATUS get_chunk_mappings_in_range(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
3200 {
3201     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3202     get_chunk_mappings_data_t *get_chunk_mappings_data = (get_chunk_mappings_data_t *)data;
3203     NvU64 chunk_end = chunk->address + uvm_gpu_chunk_get_size(chunk) - 1;
3204 
3205     uvm_assert_mutex_locked(&pmm->lock);
3206 
3207     // Kernel chunks do not have assigned VA blocks so we can just skip them
3208     if (uvm_pmm_gpu_memory_type_is_kernel(chunk->type))
3209         return NV_WARN_NOTHING_TO_DO;
3210 
3211     // This chunk is located before the requested physical range. Skip its
3212     // children and keep going
3213     if (chunk_end < get_chunk_mappings_data->phys_start)
3214         return NV_WARN_NOTHING_TO_DO;
3215 
3216     // We are beyond the search phys range. Stop traversing.
3217     if (chunk->address > get_chunk_mappings_data->phys_end) {
3218         if (get_chunk_mappings_data->num_mappings > 0)
3219             return NV_ERR_MORE_DATA_AVAILABLE;
3220         else
3221             return NV_ERR_OUT_OF_RANGE;
3222     }
3223 
3224     uvm_spin_lock(&pmm->list_lock);
3225 
3226     // Return results for allocated leaf chunks only
3227     if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
3228         uvm_reverse_map_t *reverse_map;
3229 
3230         UVM_ASSERT(chunk->va_block);
3231         uvm_va_block_retain(chunk->va_block);
3232 
3233         reverse_map = &get_chunk_mappings_data->mappings[get_chunk_mappings_data->num_mappings];
3234 
3235         reverse_map->va_block = chunk->va_block;
3236         reverse_map->region   = uvm_va_block_region(chunk->va_block_page_index,
3237                                                     chunk->va_block_page_index + uvm_gpu_chunk_get_size(chunk) / PAGE_SIZE);
3238         reverse_map->owner    = gpu->id;
3239 
3240         // If we land in the middle of a chunk, adjust the offset
3241         if (get_chunk_mappings_data->phys_start > chunk->address) {
3242             NvU64 offset = get_chunk_mappings_data->phys_start - chunk->address;
3243 
3244             reverse_map->region.first += offset / PAGE_SIZE;
3245         }
3246 
3247         // If the physical range doesn't cover the whole chunk, adjust num_pages
3248         if (get_chunk_mappings_data->phys_end < chunk_end)
3249             reverse_map->region.outer -= (chunk_end - get_chunk_mappings_data->phys_end) / PAGE_SIZE;
3250 
3251         ++get_chunk_mappings_data->num_mappings;
3252     }
3253 
3254     uvm_spin_unlock(&pmm->list_lock);
3255 
3256     return NV_OK;
3257 }
3258 
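// Translate a physical region into the reverse (VA block) mappings that back it.
// The region is processed one root chunk at a time, with a pre-order walk
// gathering the allocated leaf chunks of each root chunk. For example
// (illustrative addresses), a 128K region starting 64K before a 2M boundary is
// handled in two iterations: the last 64K of one root chunk, then the first 64K
// of the next.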
3259 NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region_size, uvm_reverse_map_t *out_mappings)
3260 {
3261     NvU64 chunk_base_addr = UVM_ALIGN_DOWN(phys_addr, UVM_CHUNK_SIZE_MAX);
3262     NvU64 size_in_chunk = min(UVM_CHUNK_SIZE_MAX - (phys_addr - chunk_base_addr), region_size);
3263     NvU32 num_mappings = 0;
3264 
3265     UVM_ASSERT(PAGE_ALIGNED(phys_addr));
3266     UVM_ASSERT(PAGE_ALIGNED(region_size));
3267 
3268     uvm_mutex_lock(&pmm->lock);
3269 
3270     // Traverse the whole requested region
3271     do {
3272         NV_STATUS status = NV_OK;
3273         uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_address(pmm, phys_addr);
3274         uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
3275         get_chunk_mappings_data_t get_chunk_mappings_data;
3276 
3277         get_chunk_mappings_data.phys_start   = phys_addr;
3278         get_chunk_mappings_data.phys_end     = phys_addr + size_in_chunk - 1;
3279         get_chunk_mappings_data.mappings     = out_mappings + num_mappings;
3280         get_chunk_mappings_data.num_mappings = 0;
3281 
3282         // Walk the chunks for the current root chunk
3283         status = chunk_walk_pre_order(pmm,
3284                                       chunk,
3285                                       get_chunk_mappings_in_range,
3286                                       &get_chunk_mappings_data);
3287         if (status == NV_ERR_OUT_OF_RANGE)
3288             break;
3289 
3290         if (get_chunk_mappings_data.num_mappings > 0) {
3291             UVM_ASSERT(status == NV_OK || status == NV_ERR_MORE_DATA_AVAILABLE);
3292             num_mappings += get_chunk_mappings_data.num_mappings;
3293         }
3294         else {
3295             UVM_ASSERT(status == NV_OK);
3296         }
3297 
3298         region_size -= size_in_chunk;
3299         phys_addr += size_in_chunk;
3300         size_in_chunk = min((NvU64)UVM_CHUNK_SIZE_MAX, region_size);
3301     } while (region_size > 0);
3302 
3303     uvm_mutex_unlock(&pmm->lock);
3304 
3305     return num_mappings;
3306 }
3307 
3308 #if UVM_IS_CONFIG_HMM()
3309 
3310 static uvm_pmm_gpu_t *devmem_page_to_pmm(struct page *page)
3311 {
3312     return container_of(page->pgmap, uvm_pmm_gpu_t, devmem.pagemap);
3313 }
3314 
3315 static uvm_gpu_chunk_t *devmem_page_to_chunk_locked(struct page *page)
3316 {
3317     uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3318     NvU64 chunk_addr = ((NvU64)page_to_pfn(page) << PAGE_SHIFT) - pmm->devmem.pagemap.range.start;
3319     size_t index = chunk_addr / UVM_CHUNK_SIZE_MAX;
3320     uvm_gpu_chunk_t *root_chunk;
3321     uvm_gpu_chunk_t *chunk;
3322     uvm_gpu_chunk_t *parent;
3323     uvm_chunk_size_t chunk_size;
3324 
3325     UVM_ASSERT(index < pmm->root_chunks.count);
3326     root_chunk = &pmm->root_chunks.array[index].chunk;
3327     UVM_ASSERT(root_chunk->address == UVM_ALIGN_DOWN(chunk_addr, UVM_CHUNK_SIZE_MAX));
3328 
3329     // Find the uvm_gpu_chunk_t that corresponds to the device private struct
3330     // page's PFN. The loop is only 0, 1, or 2 iterations.
3331     for (chunk = root_chunk;
3332          uvm_gpu_chunk_get_size(chunk) != page_size(page);
3333          chunk = parent->suballoc->subchunks[index]) {
3334 
3335         parent = chunk;
3336         UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
3337         UVM_ASSERT(parent->suballoc);
3338 
3339         chunk_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
3340         index = (size_t)uvm_div_pow2_64(chunk_addr - parent->address, chunk_size);
3341         UVM_ASSERT(index < num_subchunks(parent));
3342     }
3343 
3344     UVM_ASSERT(chunk->address == chunk_addr);
3345     UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
3346     UVM_ASSERT(chunk->is_referenced);
3347 
3348     return chunk;
3349 }
3350 
3351 uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page)
3352 {
3353     uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3354     uvm_gpu_chunk_t *chunk;
3355 
3356     UVM_ASSERT(is_device_private_page(page));
3357 
3358     uvm_spin_lock(&pmm->list_lock);
3359     chunk = devmem_page_to_chunk_locked(page);
3360     uvm_spin_unlock(&pmm->list_lock);
3361 
3362     return chunk;
3363 }
3364 
3365 uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
3366 {
3367     uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3368     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3369 
3370     UVM_ASSERT(is_device_private_page(page));
3371 
3372     return gpu->id;
3373 }
3374 
3375 // Check that there are no orphan pages. This should only be called as part of
3376 // removing a GPU: after all work is stopped and all va_blocks have been
3377 // destroyed. By now there should be no device-private page references left as
3378 // there are no va_space's left on this GPU and orphan pages should be removed
3379 // by va_space destruction or unregistration from the GPU.
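//
// Concretely (mirroring the checks below): a root chunk still in the IS_SPLIT
// state, or any device-private page in the pagemap range with a non-zero
// refcount, counts as an orphan and makes the check fail.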
3380 static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
3381 {
3382     size_t i;
3383     bool ret = true;
3384     unsigned long pfn;
3385     struct range range = pmm->devmem.pagemap.range;
3386 
3387     if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
3388         return ret;
3389 
3390     // Scan all the root chunks looking for subchunks which are still
3391     // referenced.
3392     for (i = 0; i < pmm->root_chunks.count; i++) {
3393         uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
3394 
3395         root_chunk_lock(pmm, root_chunk);
3396         if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
3397             ret = false;
3398         root_chunk_unlock(pmm, root_chunk);
3399     }
3400 
3401     for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
3402         struct page *page = pfn_to_page(pfn);
3403 
3404         if (!is_device_private_page(page)) {
3405             ret = false;
3406             break;
3407         }
3408 
3409         if (page_count(page)) {
3410             ret = false;
3411             break;
3412         }
3413     }
3414 
3415     return ret;
3416 }
3417 
3418 static void devmem_page_free(struct page *page)
3419 {
3420     uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3421     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3422     uvm_gpu_chunk_t *chunk;
3423 
3424     page->zone_device_data = NULL;
3425 
3426     // We should be calling free_chunk() except that it acquires a mutex and
3427     // we may be in an interrupt context where we can't do that. Instead,
3428     // do a lazy free. Note that we have to use a "normal" spin lock because
3429     // the UVM context is not available.
3430     spin_lock(&pmm->list_lock.lock);
3431 
3432     chunk = devmem_page_to_chunk_locked(page);
3433     UVM_ASSERT(chunk->is_referenced);
3434     chunk->is_referenced = false;
3435     list_add_tail(&chunk->list, &pmm->root_chunks.va_block_lazy_free);
3436 
3437     spin_unlock(&pmm->list_lock.lock);
3438 
3439     nv_kthread_q_schedule_q_item(&gpu->parent->lazy_free_q,
3440                                  &pmm->root_chunks.va_block_lazy_free_q_item);
3441 }
3442 
3443 // This is called by HMM when the CPU faults on a ZONE_DEVICE private entry.
3444 static vm_fault_t devmem_fault(struct vm_fault *vmf)
3445 {
3446     uvm_va_space_t *va_space = vmf->page->zone_device_data;
3447 
3448     if (!va_space)
3449         return VM_FAULT_SIGBUS;
3450 
3451     return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
3452 }
3453 
3454 static vm_fault_t devmem_fault_entry(struct vm_fault *vmf)
3455 {
3456     UVM_ENTRY_RET(devmem_fault(vmf));
3457 }
3458 
3459 static const struct dev_pagemap_ops uvm_pmm_devmem_ops =
3460 {
3461     .page_free = devmem_page_free,
3462     .migrate_to_ram = devmem_fault_entry,
3463 };
3464 
3465 static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
3466 {
3467     unsigned long size = pmm->root_chunks.count * UVM_CHUNK_SIZE_MAX;
3468     uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
3469     struct resource *res;
3470     void *ptr;
3471     NV_STATUS status;
3472 
3473     if (!uvm_hmm_is_enabled_system_wide()) {
3474         devmem->pagemap.owner = NULL;
3475         return NV_OK;
3476     }
3477 
3478     res = request_free_mem_region(&iomem_resource, size, "nvidia-uvm-hmm");
3479     if (IS_ERR(res)) {
3480         UVM_ERR_PRINT("request_free_mem_region() err %ld\n", PTR_ERR(res));
3481         status = errno_to_nv_status(PTR_ERR(res));
3482         goto err;
3483     }
3484 
3485     devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
3486     devmem->pagemap.range.start = res->start;
3487     devmem->pagemap.range.end = res->end;
3488     devmem->pagemap.nr_range = 1;
3489     devmem->pagemap.ops = &uvm_pmm_devmem_ops;
3490     devmem->pagemap.owner = &g_uvm_global;
3491 
3492     // The NUMA node ID doesn't matter for ZONE_DEVICE private pages.
3493     ptr = memremap_pages(&devmem->pagemap, NUMA_NO_NODE);
3494     if (IS_ERR(ptr)) {
3495         UVM_ERR_PRINT("memremap_pages() err %ld\n", PTR_ERR(ptr));
3496         status = errno_to_nv_status(PTR_ERR(ptr));
3497         goto err_release;
3498     }
3499 
3500     return NV_OK;
3501 
3502 err_release:
3503     release_mem_region(res->start, resource_size(res));
3504 err:
3505     devmem->pagemap.owner = NULL;
3506     return status;
3507 }
3508 
3509 static void devmem_deinit(uvm_pmm_gpu_t *pmm)
3510 {
3511     uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
3512 
3513     if (!devmem->pagemap.owner)
3514         return;
3515 
3516     memunmap_pages(&devmem->pagemap);
3517     release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
3518 }
3519 
3520 unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
3521 {
3522     return (pmm->devmem.pagemap.range.start + chunk->address) >> PAGE_SHIFT;
3523 }
3524 
3525 #endif // UVM_IS_CONFIG_HMM()
3526 
3527 #if !UVM_IS_CONFIG_HMM()
3528 static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
3529 {
3530     return NV_OK;
3531 }
3532 
3533 static void devmem_deinit(uvm_pmm_gpu_t *pmm)
3534 {
3535 }
3536 
3537 static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
3538 {
3539     return true;
3540 }
3541 #endif // UVM_IS_CONFIG_HMM()
3542 
3543 static void process_lazy_free(uvm_pmm_gpu_t *pmm)
3544 {
3545     uvm_gpu_chunk_t *chunk;
3546 
3547     uvm_spin_lock(&pmm->list_lock);
3548 
3549     // Note: We can't use list_for_each_entry_safe() because we drop the lock
3550     // in the loop. Instead, just keep removing the first entry until the list
3551     // is empty.
3552     while (!list_empty(&pmm->root_chunks.va_block_lazy_free)) {
3553         chunk = list_first_entry(&pmm->root_chunks.va_block_lazy_free, uvm_gpu_chunk_t, list);
3554         list_del_init(&chunk->list);
3555         uvm_spin_unlock(&pmm->list_lock);
3556 
3557         free_chunk(pmm, chunk);
3558 
3559         uvm_spin_lock(&pmm->list_lock);
3560     }
3561 
3562     uvm_spin_unlock(&pmm->list_lock);
3563 }
3564 
3565 static void process_lazy_free_entry(void *args)
3566 {
3567     UVM_ENTRY_VOID(process_lazy_free(args));
3568 }
3569 
3570 NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
3571 {
3572     uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3573     const uvm_chunk_sizes_mask_t chunk_size_init[][UVM_PMM_GPU_MEMORY_TYPE_COUNT] =
3574     {
3575         { gpu->parent->mmu_user_chunk_sizes,
3576           gpu->parent->mmu_user_chunk_sizes,
3577           gpu->parent->mmu_kernel_chunk_sizes,
3578           gpu->parent->mmu_kernel_chunk_sizes },
3579         { 0, 0, uvm_mem_kernel_chunk_sizes(gpu), uvm_mem_kernel_chunk_sizes(gpu)},
3580     };
3581     NV_STATUS status = NV_OK;
3582     size_t i, j, k;
3583 
3584     // UVM_CHUNK_SIZE_INVALID is UVM_CHUNK_SIZE_MAX shifted left by 1. This protects
3585     // UVM_CHUNK_SIZE_INVALID from being negative
3586     BUILD_BUG_ON(UVM_CHUNK_SIZE_MAX >= UVM_CHUNK_SIZE_INVALID);
3587 
3588     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3589 
3590     for (i = 0; i < ARRAY_SIZE(pmm->free_list); i++) {
3591         for (j = 0; j < ARRAY_SIZE(pmm->free_list[i]); j++) {
3592             for (k = 0; k < ARRAY_SIZE(pmm->free_list[i][j]); k++)
3593                 INIT_LIST_HEAD(&pmm->free_list[i][j][k]);
3594         }
3595     }
3596     INIT_LIST_HEAD(&pmm->root_chunks.va_block_used);
3597     INIT_LIST_HEAD(&pmm->root_chunks.va_block_unused);
3598     INIT_LIST_HEAD(&pmm->root_chunks.va_block_lazy_free);
3599     nv_kthread_q_item_init(&pmm->root_chunks.va_block_lazy_free_q_item, process_lazy_free_entry, pmm);
3600 
3601     uvm_mutex_init(&pmm->lock, UVM_LOCK_ORDER_PMM);
3602     uvm_init_rwsem(&pmm->pma_lock, UVM_LOCK_ORDER_PMM_PMA);
3603     uvm_spin_lock_init(&pmm->list_lock, UVM_LOCK_ORDER_LEAF);
3604 
3605     pmm->initialized = true;
3606 
3607     for (i = 0; i < UVM_PMM_GPU_MEMORY_TYPE_COUNT; i++) {
3608         pmm->chunk_sizes[i] = 0;
3609         // Add the common root chunk size to all memory types
3610         pmm->chunk_sizes[i] |= UVM_CHUNK_SIZE_MAX;
3611         for (j = 0; j < ARRAY_SIZE(chunk_size_init); j++)
3612             pmm->chunk_sizes[i] |= chunk_size_init[j][i];
3613 
3614         UVM_ASSERT(pmm->chunk_sizes[i] < UVM_CHUNK_SIZE_INVALID);
3615         UVM_ASSERT_MSG(hweight_long(pmm->chunk_sizes[i]) <= UVM_MAX_CHUNK_SIZES,
3616                 "chunk sizes %lu, max chunk sizes %u\n", hweight_long(pmm->chunk_sizes[i]), UVM_MAX_CHUNK_SIZES);
3617     }
3618 
3619     status = init_caches(pmm);
3620     if (status != NV_OK)
3621         goto cleanup;
3622 
3623     // Assert that max physical address of the GPU is not unreasonably big for
3624     // creating the flat array of root chunks. 256GB should provide a reasonable
3625     // amount of future-proofing and results in 128K chunks which is still
3626     // manageable.
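    // (For reference, 256GB / 2MB per root chunk works out to 131072, i.e. 128K,
    // array entries.)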
3627     UVM_ASSERT_MSG(gpu->mem_info.max_allocatable_address < UVM_GPU_MAX_PHYS_MEM,
3628                    "Max physical address 0x%llx exceeds limit of 0x%llx\n",
3629                    gpu->mem_info.max_allocatable_address,
3630                    UVM_GPU_MAX_PHYS_MEM);
3631 
3632     // Align up the size to have a root chunk for the last part of the FB. PMM
3633     // won't be able to allocate it if it doesn't fit a whole root chunk, but
3634     // it's convenient to have it for uvm_test_pma_alloc_free().
3635     pmm->root_chunks.count = UVM_ALIGN_UP(gpu->mem_info.max_allocatable_address, UVM_CHUNK_SIZE_MAX) /
3636                              UVM_CHUNK_SIZE_MAX;
3637     pmm->root_chunks.array = uvm_kvmalloc_zero(sizeof(*pmm->root_chunks.array) * pmm->root_chunks.count);
3638     if (!pmm->root_chunks.array) {
3639         status = NV_ERR_NO_MEMORY;
3640         goto cleanup;
3641     }
3642 
3643     // Initialize all root chunks to be PMA owned and set their addresses
3644     for (i = 0; i < pmm->root_chunks.count; ++i) {
3645         uvm_gpu_chunk_t *chunk = &pmm->root_chunks.array[i].chunk;
3646 
3647         INIT_LIST_HEAD(&chunk->list);
3648         chunk->gpu_index = uvm_id_gpu_index(gpu->id);
3649         chunk->state = UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED;
3650         uvm_gpu_chunk_set_size(chunk, UVM_CHUNK_SIZE_MAX);
3651         chunk->address = i * UVM_CHUNK_SIZE_MAX;
3652         chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
3653     }
3654 
3655     status = uvm_bit_locks_init(&pmm->root_chunks.bitlocks, pmm->root_chunks.count, UVM_LOCK_ORDER_PMM_ROOT_CHUNK);
3656     if (status != NV_OK)
3657         goto cleanup;
3658 
3659     if (gpu->mem_info.size != 0) {
3660         status = uvm_rm_locked_call(nvUvmInterfaceGetPmaObject(uvm_gpu_device_handle(gpu), &pmm->pma, &pmm->pma_stats));
3661 
3662         if (status != NV_OK)
3663             goto cleanup;
3664 
3665         if (gpu_supports_pma_eviction(gpu)) {
3666             status = nvUvmInterfacePmaRegisterEvictionCallbacks(pmm->pma,
3667                                                                 uvm_pmm_gpu_pma_evict_pages_wrapper_entry,
3668                                                                 uvm_pmm_gpu_pma_evict_range_wrapper_entry,
3669                                                                 pmm);
3670             if (status != NV_OK)
3671                 goto cleanup;
3672         }
3673     }
3674 
3675     status = devmem_init(pmm);
3676     if (status != NV_OK)
3677         goto cleanup;
3678 
3679     return NV_OK;
3680 cleanup:
3681     uvm_pmm_gpu_deinit(pmm);
3682     return status;
3683 }
3684 
3685 // Return to PMA any remaining free root chunks. Currently only USER
3686 // (non-pinned) chunks are pre-allocated, so the KERNEL free list should be
3687 // empty at this point. However, we may want to batch the allocation of pinned
3688 // pages in the future, too.
3689 static void release_free_root_chunks(uvm_pmm_gpu_t *pmm)
3690 {
3691     uvm_pmm_gpu_memory_type_t type;
3692 
3693     for (type = 0; type < UVM_PMM_GPU_MEMORY_TYPE_COUNT; ++type) {
3694         uvm_pmm_list_zero_t zero_type;
3695 
3696         while (free_next_available_root_chunk(pmm, type))
3697             ;
3698 
3699         for (zero_type = 0; zero_type < UVM_PMM_LIST_ZERO_COUNT; ++zero_type)
3700             UVM_ASSERT(list_empty(find_free_list(pmm, type, UVM_CHUNK_SIZE_MAX, zero_type)));
3701     }
3702 }
3703 
3704 void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
3705 {
3706     uvm_gpu_t *gpu;
3707     size_t i, j, k;
3708 
3709     if (!pmm->initialized)
3710         return;
3711 
3712     gpu = uvm_pmm_to_gpu(pmm);
3713 
3714     UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
3715     nv_kthread_q_flush(&gpu->parent->lazy_free_q);
3716     UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
3717     release_free_root_chunks(pmm);
3718 
3719     if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu))
3720         nvUvmInterfacePmaUnregisterEvictionCallbacks(pmm->pma);
3721 
3722     // TODO: Bug 1766184: Handle ECC/RC
3723     for (i = 0; i < ARRAY_SIZE(pmm->free_list); i++) {
3724         for (j = 0; j < ARRAY_SIZE(pmm->free_list[i]); j++) {
3725             for (k = 0; k < ARRAY_SIZE(pmm->free_list[i][j]); ++k) {
3726                 UVM_ASSERT_MSG(list_empty(&pmm->free_list[i][j][k]), "i: %s, j: %zu, k: %zu\n",
3727                                uvm_pmm_gpu_memory_type_string(i), j, k);
3728             }
3729         }
3730     }
3731 
3732     uvm_bit_locks_deinit(&pmm->root_chunks.bitlocks);
3733 
3734     for (i = 0; i < ARRAY_SIZE(pmm->root_chunks.indirect_peer); i++) {
3735         UVM_ASSERT(pmm->root_chunks.indirect_peer[i].dma_addrs == NULL);
3736         UVM_ASSERT(atomic64_read(&pmm->root_chunks.indirect_peer[i].map_count) == 0);
3737     }
3738 
3739     if (pmm->root_chunks.array) {
3740         // Make sure that all chunks have been returned to PMA
3741         for (i = 0; i < pmm->root_chunks.count; ++i) {
3742             uvm_gpu_chunk_t *chunk = &pmm->root_chunks.array[i].chunk;
3743             UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED,
3744                            "index %zu state %s GPU %s\n",
3745                            i,
3746                            uvm_pmm_gpu_chunk_state_string(chunk->state),
3747                            uvm_gpu_name(gpu));
3748         }
3749     }
3750     uvm_kvfree(pmm->root_chunks.array);
3751 
3752     deinit_caches(pmm);
3753 
3754     devmem_deinit(pmm);
3755 
3756     pmm->initialized = false;
3757 }
3758 
3759 NV_STATUS uvm_test_evict_chunk(UVM_TEST_EVICT_CHUNK_PARAMS *params, struct file *filp)
3760 {
3761     NV_STATUS status = NV_OK;
3762     uvm_gpu_t *gpu;
3763     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3764     uvm_va_block_t *block = NULL;
3765     uvm_gpu_root_chunk_t *root_chunk = NULL;
3766     uvm_pmm_gpu_t *pmm;
3767     struct mm_struct *mm;
3768 
3769     params->chunk_was_evicted = NV_FALSE;
3770     params->evicted_physical_address = 0;
3771     params->chunk_size_backing_virtual = 0;
3772 
3773     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
3774     uvm_va_space_down_read(va_space);
3775 
3776     gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
3777     if (!gpu || !uvm_parent_gpu_supports_eviction(gpu->parent)) {
3778         uvm_va_space_up_read(va_space);
3779         uvm_va_space_mm_or_current_release_unlock(va_space, mm);
3780         return NV_ERR_INVALID_DEVICE;
3781     }
3782     pmm = &gpu->pmm;
3783 
3784     // Retain the GPU before unlocking the VA space so that it sticks around.
3785     uvm_gpu_retain(gpu);
3786 
3787     // For virtual mode, look up and retain the block first so that eviction can
3788     // be started without the VA space lock held.
3789     if (params->eviction_mode == UvmTestEvictModeVirtual) {
3790         if (mm)
3791             status = uvm_va_block_find_create(va_space, params->address, NULL, &block);
3792         else
3793             status = uvm_va_block_find_create_managed(va_space, params->address, &block);
3794 
3795         if (status != NV_OK) {
3796             uvm_va_space_up_read(va_space);
3797             uvm_va_space_mm_or_current_release_unlock(va_space, mm);
3798             goto out;
3799         }
3800 
3801         // Retain the block before unlocking the VA space lock so that we can
3802         // safely access it later.
3803         uvm_va_block_retain(block);
3804     }
3805 
3806     // Unlock the VA space to better emulate real eviction, where a VA space lock
3807     // may not be held or may be held for a different VA space.
3808     uvm_va_space_up_read(va_space);
3809     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
3810 
3811     if (params->eviction_mode == UvmTestEvictModeVirtual) {
3812         UVM_ASSERT(block);
3813 
3814         uvm_mutex_lock(&block->lock);
3815 
3816         // As the VA space lock is not held we need to make sure the block
3817         // is still alive.
3818         if (!uvm_va_block_is_dead(block)) {
3819             // The block might have been split in the meantime and may no longer
3820             // cover the address as a result.
3821             if (params->address >= block->start && params->address <= block->end) {
3822                 uvm_gpu_chunk_t *chunk = uvm_va_block_lookup_gpu_chunk(block, gpu, params->address);
3823 
3824                 uvm_spin_lock(&pmm->list_lock);
3825                 if (chunk && chunk_is_evictable(pmm, chunk)) {
3826                     chunk_start_eviction(pmm, chunk);
3827                     root_chunk = root_chunk_from_chunk(pmm, chunk);
3828                     params->chunk_size_backing_virtual = uvm_gpu_chunk_get_size(chunk);
3829                 }
3830                 uvm_spin_unlock(&pmm->list_lock);
3831             }
3832         }
3833         else {
3834             // Consider it an error to free the block before the eviction ioctl
3835             // is done.
3836             status = NV_ERR_INVALID_ADDRESS;
3837         }
3838 
3839         uvm_mutex_unlock(&block->lock);
3840         uvm_va_block_release(block);
3841 
3842         if (status != NV_OK)
3843             goto out;
3844     }
3845     else if (params->eviction_mode == UvmTestEvictModePhysical) {
3846         uvm_gpu_chunk_t *chunk;
3847         size_t index = params->address / UVM_CHUNK_SIZE_MAX;
3848 
3849         if (index >= pmm->root_chunks.count) {
3850             status = NV_ERR_INVALID_ADDRESS;
3851             goto out;
3852         }
3853 
3854         root_chunk = &pmm->root_chunks.array[index];
3855         chunk = &root_chunk->chunk;
3856 
3857         uvm_spin_lock(&pmm->list_lock);
3858 
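             // Only start eviction if the root chunk is currently evictable;
             // otherwise report that there was nothing to evict.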
3859         if (chunk_is_evictable(pmm, chunk))
3860             chunk_start_eviction(pmm, chunk);
3861         else
3862             chunk = NULL;
3863 
3864         uvm_spin_unlock(&pmm->list_lock);
3865 
3866         if (!chunk)
3867             root_chunk = NULL;
3868     }
3869     else if (params->eviction_mode == UvmTestEvictModeDefault) {
3870         root_chunk = pick_root_chunk_to_evict(pmm);
3871     }
3872     else {
3873         UVM_DBG_PRINT("Invalid eviction mode: 0x%x\n", params->eviction_mode);
3874         status = NV_ERR_INVALID_ARGUMENT;
3875         goto out;
3876     }
3877 
3878     if (!root_chunk) {
3879         // Not finding a chunk to evict is not considered an error; the caller
3880         // can inspect chunk_was_evicted to see whether anything was evicted.
3881         goto out;
3882     }
3883 
3884     uvm_mutex_lock(&pmm->lock);
3885     status = evict_root_chunk(pmm, root_chunk, PMM_CONTEXT_DEFAULT);
3886     uvm_mutex_unlock(&pmm->lock);
3887 
3888     if (status != NV_OK)
3889         goto out;
3890 
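         // Eviction succeeded: report it to the caller and free the now-evicted
         // root chunk.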
3891     params->chunk_was_evicted = NV_TRUE;
3892     params->evicted_physical_address = root_chunk->chunk.address;
3893     free_chunk(pmm, &root_chunk->chunk);
3894 
3895 out:
3896     uvm_gpu_release(gpu);
3897     return status;
3898 }
3899 
3900 static NV_STATUS test_check_pma_allocated_chunks(uvm_pmm_gpu_t *pmm,
3901                                                  UVM_TEST_PMA_ALLOC_FREE_PARAMS *params,
3902                                                  NvU64 *pages)
3903 {
3904     NV_STATUS status = NV_OK;
3905     NvU32 i;
3906 
3907     for (i = 0; i < params->num_pages; ++i) {
3908         uvm_gpu_root_chunk_t *root_chunk;
3909         NvU64 address;
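             // Contiguous allocations return only the base address, so derive each
             // page's address from it; otherwise every page has its own entry.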
3910         if (params->contiguous)
3911             address = pages[0] + ((NvU64)params->page_size) * i;
3912         else
3913             address = pages[i];
3914 
3915         root_chunk = root_chunk_from_address(pmm, address);
3916 
3917         if (!IS_ALIGNED(address, params->page_size)) {
3918             UVM_TEST_PRINT("Returned unaligned address 0x%llx page size %u\n", address, params->page_size);
3919             status = NV_ERR_INVALID_STATE;
3920         }
3921 
3922         // The chunk should still be in the PMA owned state
3923         uvm_spin_lock(&pmm->list_lock);
3924         if (root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
3925             UVM_TEST_PRINT("Root chunk 0x%llx invalid state: %s, allocated [0x%llx, 0x%llx)\n",
3926                            root_chunk->chunk.address,
3927                            uvm_pmm_gpu_chunk_state_string(root_chunk->chunk.state),
3928                            address, address + params->page_size);
3929             status = NV_ERR_INVALID_STATE;
3930         }
3931         uvm_spin_unlock(&pmm->list_lock);
3932     }
3933     return status;
3934 }
3935 
3936 NV_STATUS uvm_test_pma_alloc_free(UVM_TEST_PMA_ALLOC_FREE_PARAMS *params, struct file *filp)
3937 {
3938     NV_STATUS status = NV_OK;
3939     uvm_gpu_t *gpu;
3940     uvm_pmm_gpu_t *pmm;
3941     NvU64 page;
3942     NvU64 *pages = NULL;
3943     NvU32 free_flags;
3944     UvmPmaAllocationOptions options = {0};
3945     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3946 
3947     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
3948     if (!gpu)
3949         return NV_ERR_INVALID_DEVICE;
3950 
3951     pmm = &gpu->pmm;
3952 
3953     options.flags = UVM_PMA_ALLOCATE_PINNED;
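         // A contiguous allocation returns a single base address, so one NvU64 is
         // enough; otherwise allocate one entry per page.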
3954     if (params->contiguous) {
3955         options.flags |= UVM_PMA_ALLOCATE_CONTIGUOUS;
3956         pages = &page;
3957     }
3958     else {
3959         pages = uvm_kvmalloc(sizeof(*pages) * params->num_pages);
3960         if (!pages) {
3961             status = NV_ERR_NO_MEMORY;
3962             goto out;
3963         }
3964     }
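         // A non-zero physical range restricts where PMA may place the allocation.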
3965     if (params->phys_begin != 0 || params->phys_end != 0) {
3966         options.physBegin = params->phys_begin;
3967         options.physEnd = params->phys_end;
3968         options.flags |= UVM_PMA_ALLOCATE_SPECIFY_ADDRESS_RANGE;
3969     }
3970 
3971     status = nvUvmInterfacePmaAllocPages(pmm->pma, params->num_pages, params->page_size, &options, pages);
3972     if (status != NV_OK)
3973         goto out;
3974 
3975     status = test_check_pma_allocated_chunks(pmm, params, pages);
3976     if (status != NV_OK) {
3977         UVM_TEST_PRINT("Failed before the nap\n");
3978         goto free;
3979     }
3980 
3981     if (params->nap_us_before_free)
3982         usleep_range(params->nap_us_before_free, params->nap_us_before_free + 10);
3983 
3984     status = test_check_pma_allocated_chunks(pmm, params, pages);
3985     if (status != NV_OK)
3986         UVM_TEST_PRINT("Failed after the nap\n");
3987 
3988 free:
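         // Free with the same flags used for the allocation and tell PMA whether
         // the memory is still zero.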
3989     free_flags = options.flags;
3990 
3991     if (!!(options.resultFlags & UVM_PMA_ALLOCATE_RESULT_IS_ZERO))
3992         free_flags |= UVM_PMA_FREE_IS_ZERO;
3993 
3994     nvUvmInterfacePmaFreePages(gpu->pmm.pma, pages, params->num_pages, params->page_size, free_flags);
3995 
3996 out:
3997     if (!params->contiguous)
3998         uvm_kvfree(pages);
3999 
4000     uvm_gpu_release(gpu);
4001     return status;
4002 }
4003 
4004 NV_STATUS uvm_test_pmm_alloc_free_root(UVM_TEST_PMM_ALLOC_FREE_ROOT_PARAMS *params, struct file *filp)
4005 {
4006     NV_STATUS status = NV_OK;
4007     uvm_gpu_t *gpu;
4008     uvm_pmm_gpu_t *pmm;
4009     uvm_gpu_chunk_t *chunk;
4010     uvm_tracker_t tracker = UVM_TRACKER_INIT();
4011     uvm_va_space_t *va_space = uvm_va_space_get(filp);
4012 
4013     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4014     if (!gpu)
4015         return NV_ERR_INVALID_DEVICE;
4016 
4017     pmm = &gpu->pmm;
4018 
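         // Allocate a single root-chunk-sized user chunk, evicting if necessary
         // and skipping batched allocation.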
4019     status = uvm_pmm_gpu_alloc_user(pmm,
4020                                     1,
4021                                     UVM_CHUNK_SIZE_MAX,
4022                                     UVM_PMM_ALLOC_FLAGS_EVICT | UVM_PMM_ALLOC_FLAGS_DONT_BATCH,
4023                                     &chunk,
4024                                     &tracker);
4025 
4026     if (status != NV_OK)
4027         goto out;
4028 
4029     if (params->nap_us_before_free)
4030         usleep_range(params->nap_us_before_free, params->nap_us_before_free + 10);
4031 
4032     uvm_pmm_gpu_free(pmm, chunk, NULL);
4033     uvm_tracker_deinit(&tracker);
4034 
4035 out:
4036     uvm_gpu_release(gpu);
4037     return status;
4038 }
4039 
4040 NV_STATUS uvm_test_pmm_inject_pma_evict_error(UVM_TEST_PMM_INJECT_PMA_EVICT_ERROR_PARAMS *params, struct file *filp)
4041 {
4042     uvm_gpu_t *gpu;
4043     uvm_pmm_gpu_t *pmm;
4044     uvm_va_space_t *va_space = uvm_va_space_get(filp);
4045 
4046     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4047     if (!gpu)
4048         return NV_ERR_INVALID_DEVICE;
4049 
4050     pmm = &gpu->pmm;
4051 
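         // Arm PMA eviction error injection to trigger after the given number of
         // chunks.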
4052     uvm_mutex_lock(&pmm->lock);
4053     pmm->inject_pma_evict_error_after_num_chunks = params->error_after_num_chunks;
4054     uvm_mutex_unlock(&pmm->lock);
4055 
4056     uvm_gpu_release(gpu);
4057     return NV_OK;
4058 }
4059 
4060 NV_STATUS uvm_test_pmm_release_free_root_chunks(UVM_TEST_PMM_RELEASE_FREE_ROOT_CHUNKS_PARAMS *params,
4061                                                  struct file *filp)
4062 {
4063     uvm_gpu_t *gpu;
4064     uvm_va_space_t *va_space = uvm_va_space_get(filp);
4065 
4066     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4067     if (!gpu)
4068         return NV_ERR_INVALID_DEVICE;
4069 
4070     release_free_root_chunks(&gpu->pmm);
4071 
4072     uvm_gpu_release(gpu);
4073     return NV_OK;
4074 }
4075 
4076 NV_STATUS uvm_test_pma_get_batch_size(UVM_TEST_PMA_GET_BATCH_SIZE_PARAMS *params, struct file *filp)
4077 {
4078     uvm_gpu_t *gpu;
4079     uvm_va_space_t *va_space = uvm_va_space_get(filp);
4080 
4081     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4082     if (!gpu)
4083         return NV_ERR_INVALID_DEVICE;
4084 
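         // Simulated GPUs use a single root chunk per PMA batch; real GPUs use
         // the configured non-pinned batch order.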
4085     if (gpu->parent->rm_info.isSimulated)
4086         params->pma_batch_size = UVM_CHUNK_SIZE_MAX;
4087     else
4088         params->pma_batch_size = (1 << uvm_perf_pma_batch_nonpinned_order) * UVM_CHUNK_SIZE_MAX;
4089 
4090     uvm_gpu_release(gpu);
4091     return NV_OK;
4092 }
4093 
4094 NV_STATUS uvm_test_pmm_query_pma_stats(UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS *params, struct file *filp)
4095 {
4096     uvm_gpu_t *gpu;
4097     uvm_va_space_t *va_space = uvm_va_space_get(filp);
4098 
4099     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4100     if (!gpu)
4101         return NV_ERR_INVALID_DEVICE;
4102 
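         // Snapshot PMA's free page counts; they can change at any time, hence
         // the UVM_READ_ONCE.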
4103     params->pma_stats.numFreePages64k = UVM_READ_ONCE(gpu->pmm.pma_stats->numFreePages64k);
4104     params->pma_stats.numFreePages2m = UVM_READ_ONCE(gpu->pmm.pma_stats->numFreePages2m);
4105 
4106     uvm_gpu_release(gpu);
4107     return NV_OK;
4108 }
4109