1 /*******************************************************************************
2 Copyright (c) 2015-2023 NVIDIA Corporation
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to
6 deal in the Software without restriction, including without limitation the
7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 sell copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 DEALINGS IN THE SOFTWARE.
21
22 *******************************************************************************/
23
24 //
// A high-level description of PMM is in the header file; here, some
// implementation details are discussed.
27 //
28 // There is one PMM object per GPU and the PMM state among GPUs is completely
29 // separate with the exception of a few shared kmem caches.
30 //
31 // PMM allocates all of the memory it manages from PMA which is the common GPU
32 // Physical Memory Allocator shared by UVM and RM (it's included as part of RM,
33 // but logically separate from it).
34 //
35 // The state of each GPU memory chunk is tracked in uvm_gpu_chunk_t objects.
36 // Each chunk has a type, size and state. Type and size are persistent
// throughout the chunk's lifetime, while its state changes as it's allocated, split,
38 // merged and freed.
39 //
40 // PMM maintains a pre-allocated flat array of root chunks covering all possible
41 // physical allocations that can be returned from PMA. For simplicity, PMM
42 // always allocates 2M (UVM_CHUNK_SIZE_MAX) chunks from PMA and each naturally
43 // aligned 2M chunk represents a single root chunk. The root chunks array is
44 // indexed by the physical address of each chunk divided by UVM_CHUNK_SIZE_MAX
45 // allowing for a simple and fast lookup of root chunks.
46 //
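// For instance, with UVM_CHUNK_SIZE_MAX being 2M (2^21 bytes), a chunk at
// physical address 0x12345678 belongs to the root chunk at index
// 0x12345678 >> 21 = 0x91, which covers the naturally aligned 2M range
// [0x12200000, 0x12400000).
//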
47 // Each root chunk has a tracker for any pending operations on the root chunk
48 // (including all of its subchunks in case it's split) to support asynchronous
49 // alloc and free. Each tracker is protected by a separate bitlock (see
50 // root_chunk_lock()) as synchronizing any pending operations might take a long
51 // time and it would be undesirable for that to block other operations of PMM.
52 // Notably some synchronization is required as part of allocation to handle GPU
53 // lifetime issues across VA spaces (see comments in uvm_pmm_gpu_alloc()). Bit
54 // locks (instead of a mutex in each root chunk) are used to save space.
55 //
56 // All free chunks (UVM_PMM_GPU_CHUNK_STATE_FREE) are kept on free lists, with
// one list for each combination of memory type and chunk size (see usage of
58 // uvm_pmm_gpu_t::free_list for reference). This allows for a very quick
59 // allocation and freeing of chunks in case the right size is already available
60 // on alloc or no merges are required on free. See claim_free_chunk() for
61 // allocation and chunk_free_locked() for freeing.
62 //
63 // When a chunk is allocated it transitions into the temporarily pinned state
// (UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) until it's unpinned, at which point it
// becomes allocated (UVM_PMM_GPU_CHUNK_STATE_ALLOCATED). This transition is only
66 // meaningful for user memory chunks where temporarily pinned chunks cannot be
67 // evicted. Kernel memory type chunks do not support eviction at all and they
68 // are transitioned into the allocated state as part of the allocation itself
69 // (see uvm_pmm_gpu_alloc_kernel). When the chunk is freed it transitions back
70 // to the free state and is placed on an appropriate free list.
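//
// A rough sketch of a user chunk's lifetime (kernel chunks skip the explicit
// unpin step):
//
//   PMA_OWNED --alloc--> TEMP_PINNED --unpin--> ALLOCATED --free--> FREE
//                                                                    |
//   (a fully free root chunk may eventually be returned to PMA) <----+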
71 //
72 // To support smaller allocations, PMM internally splits and merges root chunks
73 // as needed. Splitting and merging is protected by an exclusive lock
74 // (uvm_pmm_gpu_t::lock) to prevent PMM from over-allocating root chunks in case
75 // multiple threads race for a small allocation and there are no free chunks
76 // immediately available.
77 //
78 // Splitting is performed lazily, i.e. chunks are only split when a chunk of the
79 // requested type and size is not available. Splits are only done to the next
80 // smaller size and hence may need to be performed multiple times recursively to
81 // get to the desired chunk size. See alloc_chunk_with_splits(). All split
82 // chunks under the root chunk form a tree with all internal nodes being in
83 // split state and leaf nodes being in any of the free, allocated or pinned
84 // states.
85 //
86 // Merging is performed eagerly, i.e. whenever all chunks under a parent (split)
87 // chunk become free, they are merged into one bigger chunk. See
88 // free_chunk_with_merges().
89 //
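// As an illustration (assuming the supported user chunk sizes are 2M, 64K and
// 4K), satisfying a 4K request when only a free 2M root chunk exists first
// splits the 2M chunk into 64K chunks and then splits one of those into 4K
// chunks; when the 4K leaves and their 64K siblings all become free again,
// both levels are merged back eagerly.
//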
// Splitting and merging of already allocated chunks is also exposed to the
// users of those chunks. See uvm_pmm_gpu_split_chunk() and uvm_pmm_gpu_merge_chunk().
92 //
93 // As splits and merges are protected by a single PMM mutex, they are only
94 // performed when really necessary. See alloc_chunk() that falls back to split
95 // only as the last step and free_chunk() that similarly first tries performing
96 // a quick free.
97 //
98 // When a memory allocation from PMA fails and eviction is requested, PMM will
99 // check whether it can evict any user memory chunks to satisfy the request.
100 // All allocated user memory root chunks are tracked in an LRU list
101 // (root_chunks.va_block_used). A root chunk is moved to the tail of that list
102 // whenever any of its subchunks is allocated (unpinned) by a VA block (see
103 // uvm_pmm_gpu_unpin_allocated()). When a root chunk is selected for eviction,
104 // it has the eviction flag set (see pick_root_chunk_to_evict()). This flag
105 // affects many of the PMM operations on all of the subchunks of the root chunk
106 // being evicted. See usage of (root_)chunk_is_in_eviction(), in particular in
107 // chunk_free_locked() and claim_free_chunk().
108 //
109 // To evict a root chunk, all of its free subchunks are pinned, then all
110 // resident pages backed by it are moved to the CPU one VA block at a time.
111 // After all of them are moved, the root chunk is merged and returned to the
112 // caller. See evict_root_chunk() for details.
113 //
// Eviction can also be triggered by PMA. This makes it possible for
115 // other PMA clients (most importantly RM which CUDA uses for non-UVM
116 // allocations) to successfully allocate memory from the user memory pool
117 // allocated by UVM. UVM registers two eviction callbacks with PMA that PMA
118 // calls as needed to perform the eviction:
119 // - uvm_pmm_gpu_pma_evict_range - for evicting a physical range
120 // - uvm_pmm_gpu_pma_evict_pages - for evicting a number of pages
121 //
122 // Both of them perform the eviction using the same building blocks as internal
123 // eviction, but see their implementation and references to pma.h for more
124 // details.
125 //
126 // PMM locking
127 // - PMM mutex
128 // Exclusive lock protecting both internal and external splits and merges, and
129 // eviction.
130 //
131 // - PMM list lock
132 // Protects state transitions of chunks and their movement among lists.
133 //
134 // - PMM root chunk bit locks
135 // Each bit lock protects the corresponding root chunk's allocation, freeing
136 // from/to PMA, root chunk trackers, and root chunk indirect_peer mappings.
137 //
138 // - PMA allocation/eviction lock
139 // A read-write semaphore used by the eviction path to flush any pending
140 // allocations. See usage of pma_lock in alloc_root_chunk() and
141 // uvm_pmm_gpu_pma_evict_range().
142 //
// === Trade-offs ===
144 //
145 // In general, PMM is optimized towards Pascal+ and 2M VA blocks (that's also
146 // the UVM_CHUNK_SIZE_MAX) as Pascal+ makes much heavier use of PMM:
147 // - Oversubscription is Pascal+ only
148 // - On pre-Pascal (UVM-Lite) CUDA currently pre-populates all managed memory
149 // and hence performance matters mostly only during CUDA memory allocation.
150 // - On Pascal+ CUDA doesn't pre-populate and memory is allocated on first
151 // touch.
152 //
153 // The root chunk size matching the VA block chunk size allows PMM to avoid
154 // having to split and merge for the hopefully (HMM might make this hard) common
155 // allocation size of 2M on Pascal+.
156 //
// Careful benchmarking and tweaking of PMM are yet to be performed, but there
// is some evidence that PMA may cause issues for oversubscription (see
159 // bug 1775408).
160 //
161
162 #include "uvm_common.h"
163 #include "nv_uvm_interface.h"
164 #include "uvm_api.h"
165 #include "uvm_gpu.h"
166 #include "uvm_pmm_gpu.h"
167 #include "uvm_mem.h"
168 #include "uvm_mmu.h"
169 #include "uvm_global.h"
170 #include "uvm_kvmalloc.h"
171 #include "uvm_va_space.h"
172 #include "uvm_va_block.h"
173 #include "uvm_test.h"
174 #include "uvm_linux.h"
175
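// Note: S_IRUGO makes this parameter read-only at runtime, so global
// oversubscription can only be disabled at module load time, e.g. something
// along the lines of "modprobe nvidia-uvm uvm_global_oversubscription=0"
// (exact module name/packaging may vary).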
176 static int uvm_global_oversubscription = 1;
177 module_param(uvm_global_oversubscription, int, S_IRUGO);
178 MODULE_PARM_DESC(uvm_global_oversubscription, "Enable (1) or disable (0) global oversubscription support.");
179
180 #define UVM_PERF_PMA_BATCH_NONPINNED_ORDER_DEFAULT 6
181
182 // Non-pinned root chunks are allocated in batches, in order to minimize the
183 // number of calls into PMA. The number of root chunks in the batch is:
184 // (1 << uvm_perf_pma_batch_nonpinned_order)
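//
// For example, the default order of 6 results in batches of 1 << 6 = 64 root
// chunks, i.e. 128MB of vidmem per PMA allocation at the 2M root chunk size.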
185 static unsigned uvm_perf_pma_batch_nonpinned_order = UVM_PERF_PMA_BATCH_NONPINNED_ORDER_DEFAULT;
186 module_param(uvm_perf_pma_batch_nonpinned_order, uint, S_IRUGO);
187
188 // Helper type for refcounting cache
189 typedef struct
190 {
191 // Cache for given split size
192 struct kmem_cache *cache;
193
194 // Number of GPUs using given split size
195 NvU32 refcount;
196
197 // Name of cache
198 char name[32];
199 } kmem_cache_ref_t;
200
201 static kmem_cache_ref_t g_pma_address_batch_cache_ref;
202
203 struct uvm_pmm_gpu_chunk_suballoc_struct
204 {
205 // Number of allocated chunks (including pinned ones)
206 NvU32 allocated;
207
208 // Number of pinned leaf chunks under this chunk
209 //
210 // Tracked only for suballocs of root chunks to know whether a root chunk
211 // can be evicted. This is not in the uvm_gpu_root_chunk_t itself to stop
212 // the root chunks array from growing too much.
213 // TODO: Bug 1765193: Consider moving this to a union with the parent
214 // pointer in uvm_gpu_chunk_t as root chunks never have a parent or just put
215 // in the root chunk directly.
216 // TODO: Bug 1765193: This could be NvU16 if we enforce the smallest chunk
217 // size to be at least 2^21 / 2^16 = 32 bytes.
218 NvU32 pinned_leaf_chunks;
219
220 // Array of all child subchunks
221 // TODO: Bug 1765461: Can the array be inlined? It could save the parent
222 // pointer.
223 uvm_gpu_chunk_t *subchunks[];
224 };
225
226 typedef enum
227 {
228 CHUNK_WALK_PRE_ORDER,
229 CHUNK_WALK_POST_ORDER
230 } chunk_walk_order_t;
231
232 typedef NV_STATUS (*chunk_walk_func_t)(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data);
233
234 // Cache for allocation of uvm_pmm_gpu_chunk_suballoc_t. At index n it stores
235 // a suballoc structure for size 2**n.
236 //
// For convenience of the init/deinit code, level 0 is used for the allocation
// of the chunks themselves (CHUNK_CACHE).
238 static kmem_cache_ref_t chunk_split_cache[UVM_PMM_CHUNK_SPLIT_CACHE_SIZES];
239 #define CHUNK_CACHE chunk_split_cache[0].cache
240
const char *uvm_pmm_gpu_memory_type_string(uvm_pmm_gpu_memory_type_t type)
242 {
243 switch (type) {
244 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_USER);
245 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED);
246 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_KERNEL);
247 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED);
248 UVM_ENUM_STRING_DEFAULT();
249 }
250
251 BUILD_BUG_ON(UVM_PMM_GPU_MEMORY_TYPE_COUNT != 4);
252 }
253
const char *uvm_pmm_gpu_chunk_state_string(uvm_pmm_gpu_chunk_state_t state)
255 {
256 switch (state) {
257 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
258 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_FREE);
259 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
260 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
261 UVM_ENUM_STRING_CASE(UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
262 UVM_ENUM_STRING_DEFAULT();
263 }
264 }
265
266 // The PMA APIs that can be called from PMA eviction callbacks (pmaPinPages and
// pmaFreePages*) need to be called differently depending on whether it's part
268 // of PMA eviction or not. The PMM context is used to plumb that information
269 // through the stack in a couple of places.
270 typedef enum
271 {
272 PMM_CONTEXT_DEFAULT,
273 PMM_CONTEXT_PMA_EVICTION,
274 } uvm_pmm_context_t;
275
// Freeing a root chunk not only needs to differentiate between the two
// contexts for calling pmaFreePages(), but in some cases the free back to PMA
// needs to be skipped altogether.
279 typedef enum
280 {
281 FREE_ROOT_CHUNK_MODE_DEFAULT,
282 FREE_ROOT_CHUNK_MODE_PMA_EVICTION,
283 FREE_ROOT_CHUNK_MODE_SKIP_PMA_FREE
284 } free_root_chunk_mode_t;
285
static free_root_chunk_mode_t free_root_chunk_mode_from_pmm_context(uvm_pmm_context_t pmm_context)
287 {
288 switch (pmm_context) {
289 case PMM_CONTEXT_DEFAULT:
290 return FREE_ROOT_CHUNK_MODE_DEFAULT;
291 case PMM_CONTEXT_PMA_EVICTION:
292 return FREE_ROOT_CHUNK_MODE_PMA_EVICTION;
293 default:
294 UVM_ASSERT_MSG(false, "Invalid PMM context: 0x%x\n", pmm_context);
295 return FREE_ROOT_CHUNK_MODE_DEFAULT;
296 }
297 }
298
299 static NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
300 uvm_pmm_gpu_memory_type_t type,
301 uvm_chunk_size_t chunk_size,
302 uvm_pmm_alloc_flags_t flags,
303 uvm_gpu_chunk_t **chunk);
304 static NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
305 uvm_pmm_gpu_memory_type_t type,
306 uvm_pmm_alloc_flags_t flags,
307 uvm_gpu_chunk_t **chunk);
308 static void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_root_chunk_mode_t free_mode);
309 static NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
310 static void free_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
311 static void free_chunk_with_merges(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
312 static bool free_next_available_root_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_type_t type);
313 static struct list_head *find_free_list(uvm_pmm_gpu_t *pmm,
314 uvm_pmm_gpu_memory_type_t type,
315 uvm_chunk_size_t chunk_size,
316 uvm_pmm_list_zero_t zero_type);
317 static bool check_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
318 static struct list_head *find_free_list_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
319 static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
320
static size_t root_chunk_index(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
322 {
323 size_t index = root_chunk->chunk.address / UVM_CHUNK_SIZE_MAX;
324 UVM_ASSERT(index < pmm->root_chunks.count);
325 return index;
326 }
327
static void root_chunk_lock(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
329 {
330 uvm_bit_lock(&pmm->root_chunks.bitlocks, root_chunk_index(pmm, root_chunk));
331 }
332
static void uvm_assert_root_chunk_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
334 {
335 uvm_assert_bit_locked(&pmm->root_chunks.bitlocks, root_chunk_index(pmm, root_chunk));
336 }
337
static void root_chunk_unlock(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
339 {
340 uvm_bit_unlock(&pmm->root_chunks.bitlocks, root_chunk_index(pmm, root_chunk));
341 }
342
343 // TODO: Bug 1795559: Remove once PMA eviction is considered safe enough not to
344 // have an opt-out.
static bool gpu_supports_pma_eviction(uvm_gpu_t *gpu)
346 {
347 return uvm_global_oversubscription && uvm_parent_gpu_supports_eviction(gpu->parent);
348 }
349
uvm_gpu_t *uvm_pmm_to_gpu(uvm_pmm_gpu_t *pmm)
351 {
352 return container_of(pmm, uvm_gpu_t, pmm);
353 }
354
static uvm_gpu_root_chunk_t *root_chunk_from_address(uvm_pmm_gpu_t *pmm, NvU64 addr)
356 {
357 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
358 size_t index = addr / UVM_CHUNK_SIZE_MAX;
359 uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[index];
360
361 UVM_ASSERT_MSG(addr <= gpu->mem_info.max_allocatable_address,
362 "Address 0x%llx vidmem max phys 0x%llx GPU %s\n",
363 addr,
364 gpu->mem_info.max_allocatable_address,
365 uvm_gpu_name(gpu));
366 UVM_ASSERT(root_chunk->chunk.address == UVM_ALIGN_DOWN(addr, UVM_CHUNK_SIZE_MAX));
367
368 return root_chunk;
369 }
370
static uvm_gpu_root_chunk_t *root_chunk_from_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
372 {
373 return root_chunk_from_address(pmm, chunk->address);
374 }
375
static bool chunk_is_root_chunk(uvm_gpu_chunk_t *chunk)
377 {
378 return uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX;
379 }
380
static bool chunk_is_root_chunk_pinned(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
382 {
383 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
384
385 uvm_assert_spinlock_locked(&pmm->list_lock);
386
387 chunk = &root_chunk->chunk;
388
389 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED)
390 return true;
391 else if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
392 return false;
393
394 UVM_ASSERT(chunk->suballoc);
395
396 return chunk->suballoc->pinned_leaf_chunks > 0;
397 }
398
399 // Pin a chunk and update its root chunk's pinned leaf chunks count if the
400 // chunk is not a root chunk.
static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
402 {
403 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
404
405 uvm_assert_spinlock_locked(&pmm->list_lock);
406 UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
407 chunk->state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
408
409 if (chunk_is_root_chunk(chunk))
410 return;
411
412 // For subchunks, update the pinned leaf chunks count tracked in the
413 // suballoc of the root chunk.
414 chunk = &root_chunk->chunk;
415
416 // The passed-in subchunk is not the root chunk so the root chunk has to be
417 // split.
418 UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n",
419 uvm_pmm_gpu_chunk_state_string(chunk->state));
420
421 chunk->suballoc->pinned_leaf_chunks++;
422 }
423
424 // Unpin a chunk and update its root chunk's pinned leaf chunks count if the
425 // chunk is not a root chunk.
static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_chunk_state_t new_state)
427 {
428 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
429
430 uvm_assert_spinlock_locked(&pmm->list_lock);
431 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
432 UVM_ASSERT(chunk->va_block == NULL);
433 UVM_ASSERT(chunk_is_root_chunk_pinned(pmm, chunk));
434 UVM_ASSERT(new_state != UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
435
436 chunk->state = new_state;
437
438 if (chunk_is_root_chunk(chunk))
439 return;
440
441 // For subchunks, update the pinned leaf chunks count tracked in the
442 // suballoc of the root chunk.
443 chunk = &root_chunk->chunk;
444
445 // The passed-in subchunk is not the root chunk so the root chunk has to be
446 // split.
447 UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n",
448 uvm_pmm_gpu_chunk_state_string(chunk->state));
449
450 UVM_ASSERT(chunk->suballoc->pinned_leaf_chunks != 0);
451 chunk->suballoc->pinned_leaf_chunks--;
452 }
453
bool uvm_pmm_gpu_memory_type_is_user(uvm_pmm_gpu_memory_type_t type)
455 {
456 UVM_ASSERT(type < UVM_PMM_GPU_MEMORY_TYPE_COUNT);
457
458 switch (type) {
459 case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
460 case UVM_PMM_GPU_MEMORY_TYPE_USER_UNPROTECTED:
461 return true;
462 default:
463 return false;
464 }
465 }
466
static bool memory_type_is_protected(uvm_pmm_gpu_memory_type_t type)
468 {
469 switch (type) {
470 case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
471 case UVM_PMM_GPU_MEMORY_TYPE_KERNEL: // Alias UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED:
472 return true;
473 default:
474 return false;
475 }
476 }
477
static void uvm_gpu_chunk_set_in_eviction(uvm_gpu_chunk_t *chunk, bool in_eviction)
479 {
480 UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
481 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
482 chunk->in_eviction = in_eviction;
483 }
484
// A helper that queries the eviction flag of the given chunk's root chunk.
486 // Eviction is only tracked for root chunks.
static bool chunk_is_in_eviction(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
488 {
489 return root_chunk_from_chunk(pmm, chunk)->chunk.in_eviction;
490 }
491
uvm_gpu_t *uvm_gpu_chunk_get_gpu(const uvm_gpu_chunk_t *chunk)
493 {
494 uvm_gpu_t *gpu = uvm_gpu_get(uvm_gpu_id_from_index(chunk->gpu_index));
495 UVM_ASSERT(gpu);
496
497 return gpu;
498 }
499
struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
501 {
502 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
503 NvU64 sys_addr = chunk->address + gpu->parent->system_bus.memory_window_start;
504 unsigned long pfn = sys_addr >> PAGE_SHIFT;
505
506 UVM_ASSERT(sys_addr + uvm_gpu_chunk_get_size(chunk) <= gpu->parent->system_bus.memory_window_end + 1);
507 UVM_ASSERT(gpu->mem_info.numa.enabled);
508
509 return pfn_to_page(pfn);
510 }
511
void uvm_pmm_gpu_sync(uvm_pmm_gpu_t *pmm)
513 {
514 size_t i;
515
516 if (!pmm->initialized)
517 return;
518
519 // Just go over all root chunks and sync the ones that are not PMA OWNED.
520 // This is slow, but uvm_pmm_gpu_sync() is a rarely used operation not
521 // critical for performance.
522 for (i = 0; i < pmm->root_chunks.count; ++i) {
523 uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
524
525 root_chunk_lock(pmm, root_chunk);
526 if (root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
527 NV_STATUS status = uvm_tracker_wait(&root_chunk->tracker);
528 if (status != NV_OK)
529 UVM_ASSERT(status == uvm_global_get_status());
530 }
531 root_chunk_unlock(pmm, root_chunk);
532 }
533 }
534
static uvm_pmm_gpu_memory_type_t pmm_squash_memory_type(uvm_pmm_gpu_memory_type_t type)
536 {
537 if (g_uvm_global.conf_computing_enabled)
538 return type;
539
540 // Enforce the contract that when the Confidential Computing feature is
541 // disabled, all user types are alike, as well as all kernel types,
542 // respectively. See uvm_pmm_gpu_memory_type_t.
543 if (uvm_pmm_gpu_memory_type_is_user(type))
544 return UVM_PMM_GPU_MEMORY_TYPE_USER;
545
546 return UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
547 }
548
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
550 size_t num_chunks,
551 uvm_chunk_size_t chunk_size,
552 uvm_pmm_gpu_memory_type_t mem_type,
553 uvm_pmm_alloc_flags_t flags,
554 uvm_gpu_chunk_t **chunks,
555 uvm_tracker_t *out_tracker)
556 {
557 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
558 NV_STATUS status;
559 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
560 size_t i;
561
562 UVM_ASSERT((unsigned)mem_type < UVM_PMM_GPU_MEMORY_TYPE_COUNT);
563 UVM_ASSERT_MSG(is_power_of_2(chunk_size), "chunk size %u\n", chunk_size);
564 UVM_ASSERT_MSG(chunk_size & pmm->chunk_sizes[mem_type], "chunk size %u\n", chunk_size);
565 UVM_ASSERT(num_chunks == 0 || chunks);
566 UVM_ASSERT((flags & UVM_PMM_ALLOC_FLAGS_MASK) == flags);
567
568 if (flags & UVM_PMM_ALLOC_FLAGS_EVICT) {
569 // If eviction is requested then VA block locks need to be lockable
570 uvm_assert_lockable_order(UVM_LOCK_ORDER_VA_BLOCK);
571 }
572
573 mem_type = pmm_squash_memory_type(mem_type);
574 for (i = 0; i < num_chunks; i++) {
575 uvm_gpu_root_chunk_t *root_chunk;
576
577 status = alloc_chunk(pmm, mem_type, chunk_size, flags, &chunks[i]);
578 if (status != NV_OK)
579 goto error;
580
581 root_chunk = root_chunk_from_chunk(pmm, chunks[i]);
582
583 root_chunk_lock(pmm, root_chunk);
584 uvm_tracker_remove_completed(&root_chunk->tracker);
585 status = uvm_tracker_add_tracker_safe(&local_tracker, &root_chunk->tracker);
586 root_chunk_unlock(pmm, root_chunk);
587
588 if (status != NV_OK) {
589 i++;
590 goto error;
591 }
592 }
593
594 // Before we return to the caller, we need to ensure that the tracker only
595 // contains tracker entries belonging to the PMM's GPU. Otherwise we
596 // could leak trackers for other GPUs into VA spaces which never
597 // registered those GPUs, causing lifetime problems when those GPUs go
598 // away.
599 status = uvm_tracker_wait_for_other_gpus(&local_tracker, gpu);
600 if (status != NV_OK)
601 goto error;
602
603 if (out_tracker) {
604 status = uvm_tracker_add_tracker_safe(out_tracker, &local_tracker);
605 uvm_tracker_clear(&local_tracker);
606 if (status != NV_OK)
607 goto error;
608 }
609
610 return uvm_tracker_wait_deinit(&local_tracker);
611
612 error:
613 uvm_tracker_deinit(&local_tracker);
614 while (i-- > 0)
615 free_chunk(pmm, chunks[i]);
616
617 // Reset the array to make error handling easier for callers.
618 memset(chunks, 0, sizeof(chunks[0]) * num_chunks);
619
620 return status;
621 }
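
// Illustrative usage sketch (not part of the driver): allocate two user chunks
// and free them later. The 64K chunk size and the EVICT flag below are just
// example parameters.
//
//     uvm_gpu_chunk_t *chunks[2];
//     uvm_tracker_t tracker = UVM_TRACKER_INIT();
//     NV_STATUS status = uvm_pmm_gpu_alloc(&gpu->pmm,
//                                          2,
//                                          UVM_CHUNK_SIZE_64K,
//                                          UVM_PMM_GPU_MEMORY_TYPE_USER,
//                                          UVM_PMM_ALLOC_FLAGS_EVICT,
//                                          chunks,
//                                          &tracker);
//     if (status == NV_OK) {
//         // The chunks come back temporarily pinned; user chunks are normally
//         // handed to a VA block via uvm_pmm_gpu_unpin_allocated() before use.
//         uvm_pmm_gpu_free(&gpu->pmm, chunks[0], NULL);
//         uvm_pmm_gpu_free(&gpu->pmm, chunks[1], NULL);
//     }
//     uvm_tracker_deinit(&tracker);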
622
static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
624 size_t num_chunks,
625 uvm_chunk_size_t chunk_size,
626 uvm_pmm_gpu_memory_type_t memory_type,
627 uvm_pmm_alloc_flags_t flags,
628 uvm_gpu_chunk_t **chunks,
629 uvm_tracker_t *out_tracker)
630 {
631 size_t i;
632 NV_STATUS status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
633 if (status != NV_OK)
634 return status;
635
636 for (i = 0; i < num_chunks; ++i) {
637 UVM_ASSERT(chunks[i]->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
638
639 uvm_spin_lock(&pmm->list_lock);
640 chunk_unpin(pmm, chunks[i], UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
641 chunks[i]->is_referenced = false;
642 uvm_spin_unlock(&pmm->list_lock);
643 }
644
645 return NV_OK;
646 }
647
static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
649 {
650 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
651
652 uvm_assert_spinlock_locked(&pmm->list_lock);
653
654 if (uvm_pmm_gpu_memory_type_is_user(chunk->type)) {
655 if (chunk_is_root_chunk_pinned(pmm, chunk)) {
656 UVM_ASSERT(root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
657 root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
658 list_del_init(&root_chunk->chunk.list);
659 }
660 else if (root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_FREE) {
661 UVM_ASSERT(root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
662 root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
663 list_move_tail(&root_chunk->chunk.list, &pmm->root_chunks.va_block_used);
664 }
665 }
666
667 // TODO: Bug 1757148: Improve fragmentation of split chunks
668 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE)
669 list_move_tail(&chunk->list, find_free_list_chunk(pmm, chunk));
670 else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED)
671 list_del_init(&chunk->list);
672 }
673
static void gpu_unpin_temp(uvm_pmm_gpu_t *pmm,
675 uvm_gpu_chunk_t *chunk,
676 uvm_va_block_t *va_block,
677 bool is_referenced)
678 {
679 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
680 UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
681
682 INIT_LIST_HEAD(&chunk->list);
683
684 uvm_spin_lock(&pmm->list_lock);
685
686 UVM_ASSERT(!chunk->va_block);
687 UVM_ASSERT(va_block);
688 UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block));
689
690 chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
691 chunk->is_referenced = is_referenced;
692 chunk->va_block = va_block;
693 chunk_update_lists_locked(pmm, chunk);
694
695 uvm_spin_unlock(&pmm->list_lock);
696 }
697
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
699 {
700 gpu_unpin_temp(pmm, chunk, va_block, false);
701 }
702
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
704 {
705 gpu_unpin_temp(pmm, chunk, va_block, true);
706 }
707
void uvm_pmm_gpu_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker)
709 {
710 NV_STATUS status;
711
712 if (!chunk)
713 return;
714
715 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
716 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
717
718 if (tracker) {
719 uvm_gpu_root_chunk_t *root_chunk;
720
721 uvm_tracker_remove_completed(tracker);
722
723 root_chunk = root_chunk_from_chunk(pmm, chunk);
724 root_chunk_lock(pmm, root_chunk);
725
726 // Remove any completed entries from the root tracker to prevent it from
727 // growing too much over time.
728 uvm_tracker_remove_completed(&root_chunk->tracker);
729
730 status = uvm_tracker_add_tracker_safe(&root_chunk->tracker, tracker);
731 if (status != NV_OK)
732 UVM_ASSERT(status == uvm_global_get_status());
733
734 root_chunk_unlock(pmm, root_chunk);
735 }
736
737 free_chunk(pmm, chunk);
738 }
739
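// Number of direct children of a split chunk; for example, a 2M parent that
// was split into 64K children reports 2M / 64K = 32 subchunks.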
static NvU32 num_subchunks(uvm_gpu_chunk_t *parent)
741 {
742 uvm_chunk_size_t parent_size, child_size;
743 UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
744 parent_size = uvm_gpu_chunk_get_size(parent);
745 child_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
746 return (NvU32)uvm_div_pow2_64(parent_size, child_size);
747 }
748
static uvm_gpu_chunk_t *next_sibling(uvm_gpu_chunk_t *chunk)
750 {
751 uvm_gpu_chunk_t *parent = chunk->parent;
752 size_t index;
753
754 UVM_ASSERT(parent);
755 UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
756
757 index = (size_t)uvm_div_pow2_64(chunk->address - parent->address, uvm_gpu_chunk_get_size(chunk));
758 UVM_ASSERT(index < num_subchunks(parent));
759
760 ++index;
761 if (index == num_subchunks(parent))
762 return NULL;
763
764 return parent->suballoc->subchunks[index];
765 }
766
// Check that the chunk is in a mergeable state: all children must be pinned,
// or all children must be allocated with the same reverse mapping.
769 //
770 // Always returns true so it can be called from an assert macro.
static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
772 {
773 uvm_gpu_chunk_t *first_child = chunk->suballoc->subchunks[0];
774 uvm_va_block_t *child_va_block = first_child->va_block;
775 size_t i;
776
777 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
778 UVM_ASSERT(first_child->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
779 first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
780
781 for (i = 1; i < num_subchunks(chunk); i++) {
782 uvm_gpu_chunk_t *child = chunk->suballoc->subchunks[i];
783
784 UVM_ASSERT(child->state == first_child->state);
785 if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
786 uvm_gpu_chunk_t *prev_child = chunk->suballoc->subchunks[i-1];
787
788 UVM_ASSERT(child->va_block == child_va_block);
789 UVM_ASSERT(child->va_block_page_index ==
790 prev_child->va_block_page_index + uvm_gpu_chunk_get_size(prev_child) / PAGE_SIZE);
791 UVM_ASSERT(child->is_referenced == prev_child->is_referenced);
792 }
793 }
794
795 if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_FREE) {
796 UVM_ASSERT(chunk->suballoc->allocated == 0);
797 }
798 else {
799 UVM_ASSERT_MSG(chunk->suballoc->allocated == num_subchunks(chunk), "%u != %u\n",
800 chunk->suballoc->allocated, num_subchunks(chunk));
801 }
802
803 return true;
804 }
805
806 // Merges a previously-split chunk. Assumes that all of its children have
807 // uniform state. This only merges leaves, so none of the children can be in the
808 // split state themselves.
809 //
810 // The children need to be removed from any lists before the merge.
811 //
812 // The merged chunk inherits the former state of its children.
static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
814 {
815 uvm_pmm_gpu_chunk_suballoc_t *suballoc;
816 uvm_gpu_chunk_t *subchunk;
817 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
818 uvm_pmm_gpu_chunk_state_t child_state;
819 size_t i, num_sub = num_subchunks(chunk);
820
821 uvm_assert_mutex_locked(&pmm->lock);
822 UVM_ASSERT(assert_chunk_mergeable(pmm, chunk));
823
824 // Transition the chunk state under the list lock first and then clean up
825 // the subchunk state.
826 uvm_spin_lock(&pmm->list_lock);
827
828 child_state = chunk->suballoc->subchunks[0]->state;
829
830 if (child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
831 subchunk = chunk->suballoc->subchunks[0];
832 UVM_ASSERT(subchunk->va_block);
833 chunk->va_block = subchunk->va_block;
834 chunk->va_block_page_index = subchunk->va_block_page_index;
835 chunk->is_referenced = subchunk->is_referenced;
836 }
837 else if (child_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
838 UVM_ASSERT(root_chunk->chunk.suballoc->pinned_leaf_chunks >= num_sub);
839 root_chunk->chunk.suballoc->pinned_leaf_chunks += 1 - num_sub;
840 }
841
842 chunk->state = child_state;
843 suballoc = chunk->suballoc;
844 chunk->suballoc = NULL;
845
846 // The resulting chunk is assumed to be non-zero as a simplification,
847 // instead of checking that all the subchunks are zero, since callers of
848 // uvm_pmm_gpu_alloc are not required to clear it. However, we think that
849 // this covers all relevant cases since it is uncommon to split a chunk and
850 // not to use any of the subchunks later on.
851 chunk->is_zero = false;
852
853 uvm_spin_unlock(&pmm->list_lock);
854
855 for (i = 0; i < num_sub; i++) {
856 subchunk = suballoc->subchunks[i];
857
858 // The subchunks should have been removed from their lists prior to the
859 // merge.
860 UVM_ASSERT(list_empty(&subchunk->list));
861
862 if (child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED)
863 UVM_ASSERT(subchunk->va_block != NULL);
864
865 kmem_cache_free(CHUNK_CACHE, subchunk);
866 }
867
868 kmem_cache_free(chunk_split_cache[ilog2(num_sub)].cache, suballoc);
869 }
870
871 // Checks that chunk is below ancestor in the tree. Always returns true so it
872 // can be called from an assert macro.
static bool assert_chunk_under(uvm_gpu_chunk_t *chunk, uvm_gpu_chunk_t *ancestor)
874 {
875 UVM_ASSERT(ancestor->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
876 UVM_ASSERT(ancestor->suballoc);
877 UVM_ASSERT(ancestor->address <= chunk->address);
878 UVM_ASSERT(chunk->address < ancestor->address + uvm_gpu_chunk_get_size(ancestor));
879 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) <= uvm_gpu_chunk_get_size(ancestor));
880 return true;
881 }
882
883 // Traverses the chunk tree from start in the given traversal order.
884 //
885 // If the callback returns a status value of NV_WARN_NOTHING_TO_DO when doing
886 // pre-order traversal, the traversal skips walking below that chunk. In all
887 // other cases, returning any non-NV_OK value stops the walk immediately and
888 // returns that status to the caller.
889 //
890 // Be careful modifying the tree from the callback. Changing the tree below the
891 // input chunk is fine and modifying the input chunk itself is fine, but the
892 // callback must not modify the tree above the input chunk. If that is needed,
893 // return a non-NV_OK status from the walk and re-start the walk.
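//
// For example (sizes for illustration only), for a split 2M chunk whose first
// 64K child is itself split into 4K leaves, pre-order visits the 2M chunk,
// then that 64K child, then its 4K leaves, then the remaining 64K children,
// while post-order visits the 4K leaves first and the 2M chunk last.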
static NV_STATUS chunk_walk(uvm_pmm_gpu_t *pmm,
895 uvm_gpu_chunk_t *start,
896 chunk_walk_func_t func,
897 void *data,
898 chunk_walk_order_t order)
899 {
900 NV_STATUS status = NV_OK;
901 uvm_gpu_chunk_t *curr, *sibling;
902
903 curr = start;
904
905 do {
906 if (curr != start)
907 UVM_ASSERT(assert_chunk_under(curr, start));
908
909 if (order == CHUNK_WALK_PRE_ORDER) {
910 status = func(pmm, curr, data);
911 if (status != NV_OK && status != NV_WARN_NOTHING_TO_DO)
912 return status;
913 }
914
915 // Skip downward traversal on pre-order if requested
916 if (status != NV_WARN_NOTHING_TO_DO && curr->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) {
917 // If the chunk is split, walk down
918 curr = curr->suballoc->subchunks[0];
919 }
920 else {
921 // This is a leaf chunk. If not start itself, check siblings.
922 while (curr != start) {
923 if (order == CHUNK_WALK_POST_ORDER) {
924 status = func(pmm, curr, data);
925 if (status != NV_OK)
926 return status;
927 }
928
929 sibling = next_sibling(curr);
930 if (sibling) {
931 curr = sibling;
932 break;
933 }
934
935 // curr is the last chunk in its parent. Walk up and try again.
936 curr = curr->parent;
937 UVM_ASSERT(curr);
938 UVM_ASSERT(curr->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
939 }
940 }
941 } while (curr != start);
942
943 // Invoke the final callback for start
944 if (order == CHUNK_WALK_POST_ORDER)
945 return func(pmm, curr, data);
946
947 return NV_OK;
948 }
949
static NV_STATUS chunk_walk_pre_order(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *start, chunk_walk_func_t func, void *data)
951 {
952 return chunk_walk(pmm, start, func, data, CHUNK_WALK_PRE_ORDER);
953 }
954
static NV_STATUS chunk_walk_post_order(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *start, chunk_walk_func_t func, void *data)
956 {
957 return chunk_walk(pmm, start, func, data, CHUNK_WALK_POST_ORDER);
958 }
959
960 typedef struct
961 {
962 // Target size for the leaf subchunks
963 uvm_chunk_size_t min_size;
964
965 // Number of subchunks split to this point. If the subchunks array is non-
966 // NULL, this is the number of elements currently in the array.
967 size_t num_subchunks_curr;
968
969 // Number of subchunks needed for the whole split
970 size_t num_subchunks_total;
971
972 // Storage for the final split chunks. May be NULL.
973 uvm_gpu_chunk_t **subchunks;
974
975 // For testing
976 bool inject_error;
977 } split_walk_t;
978
static NV_STATUS split_walk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
980 {
981 uvm_chunk_size_t chunk_size, child_size;
982 uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[chunk->type];
983 size_t i, num_children;
984 split_walk_t *args = data;
985 NV_STATUS status;
986
987 chunk_size = uvm_gpu_chunk_get_size(chunk);
988 UVM_ASSERT(chunk_size > args->min_size);
989
990 child_size = uvm_chunk_find_prev_size(chunk_sizes, chunk_size);
991 UVM_ASSERT(child_size != UVM_CHUNK_SIZE_INVALID);
992 num_children = chunk_size / child_size;
993
994 if (unlikely(args->inject_error)) {
995 // Inject errors on the last split. inject_split_error is a bitfield,
996 // so we must take the lock to modify it. This path is only used in
997 // testing.
998 if (child_size == args->min_size &&
999 args->num_subchunks_curr + num_children == args->num_subchunks_total) {
1000 uvm_spin_lock(&pmm->list_lock);
1001 chunk->inject_split_error = true;
1002 uvm_spin_unlock(&pmm->list_lock);
1003 }
1004 }
1005
1006 status = split_gpu_chunk(pmm, chunk);
1007 if (status != NV_OK)
1008 return status;
1009
1010 // If we've hit our target, add all child subchunks to the array
1011 if (child_size == args->min_size) {
1012 for (i = 0; i < num_children; i++) {
1013 UVM_ASSERT(args->num_subchunks_curr < args->num_subchunks_total);
1014 if (args->subchunks)
1015 args->subchunks[args->num_subchunks_curr] = chunk->suballoc->subchunks[i];
1016 ++args->num_subchunks_curr;
1017 }
1018
1019 // No need to walk below this chunk
1020 return NV_WARN_NOTHING_TO_DO;
1021 }
1022
1023 return NV_OK;
1024 }
1025
static NV_STATUS merge_walk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1027 {
1028 // The merge walk uses post-order traversal, so all subchunks are guaranteed
1029 // to have already been merged.
1030 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
1031 merge_gpu_chunk(pmm, chunk);
1032 return NV_OK;
1033 }
1034
static void uvm_pmm_gpu_merge_chunk_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1036 {
1037 NV_STATUS status;
1038
1039 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
1040 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1041 chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1042
1043 uvm_assert_mutex_locked(&pmm->lock);
1044
1045 status = chunk_walk_post_order(pmm, chunk, merge_walk_func, NULL);
1046
1047 // merge_walk_func can't fail
1048 UVM_ASSERT(status == NV_OK);
1049 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1050 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1051 }
1052
NV_STATUS uvm_pmm_gpu_split_chunk(uvm_pmm_gpu_t *pmm,
1054 uvm_gpu_chunk_t *chunk,
1055 uvm_chunk_size_t subchunk_size,
1056 uvm_gpu_chunk_t **subchunks)
1057 {
1058 NV_STATUS status;
1059 split_walk_t walk_args =
1060 {
1061 .min_size = subchunk_size,
1062 .num_subchunks_curr = 0,
1063 .num_subchunks_total = uvm_gpu_chunk_get_size(chunk) / subchunk_size,
1064 .subchunks = subchunks,
1065 .inject_error = chunk->inject_split_error,
1066 };
1067
1068 UVM_ASSERT(is_power_of_2(subchunk_size));
1069 UVM_ASSERT(subchunk_size & pmm->chunk_sizes[chunk->type]);
1070 UVM_ASSERT(subchunk_size < uvm_gpu_chunk_get_size(chunk));
1071
1072 uvm_mutex_lock(&pmm->lock);
1073
1074 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1075 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1076
1077 // If we're supposed to inject an error, clear out the root chunk's flag so
1078 // we can inject after nearly all chunks have been split. Otherwise
1079 // split_gpu_chunk will fail on the first try, without creating the tree.
1080 if (unlikely(walk_args.inject_error)) {
1081 // inject_split_error is a bitfield, so we must take the lock to modify
1082 // it. This path is only used in testing.
1083 uvm_spin_lock(&pmm->list_lock);
1084 chunk->inject_split_error = false;
1085 uvm_spin_unlock(&pmm->list_lock);
1086 }
1087
1088 status = chunk_walk_pre_order(pmm, chunk, split_walk_func, &walk_args);
1089 if (status != NV_OK) {
1090 // Put the chunk back in its original state
1091 uvm_pmm_gpu_merge_chunk_locked(pmm, chunk);
1092 }
1093 else {
1094 UVM_ASSERT(walk_args.num_subchunks_curr == walk_args.num_subchunks_total);
1095 }
1096
1097 uvm_mutex_unlock(&pmm->lock);
1098 return status;
1099 }
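
// Illustrative usage sketch (hypothetical sizes): split an allocated 2M chunk
// into its 64K subchunks in place.
//
//     uvm_gpu_chunk_t *subchunks[32]; // 2M / 64K
//     NV_STATUS status = uvm_pmm_gpu_split_chunk(pmm, chunk, UVM_CHUNK_SIZE_64K, subchunks);
//     if (status == NV_OK) {
//         // chunk is now in the split state and subchunks[] holds the 32
//         // leaves, each inheriting the state (allocated or pinned) the
//         // parent chunk had before the split.
//     }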
1100
1101 typedef struct
1102 {
1103 size_t num_written;
1104 size_t num_to_write;
1105 size_t num_to_skip;
1106 uvm_gpu_chunk_t **subchunks;
1107 } get_subchunks_walk_t;
1108
static NV_STATUS get_subchunks_walk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1110 {
1111 get_subchunks_walk_t *args = data;
1112
1113 // We're only collecting leaf chunks
1114 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
1115 return NV_OK;
1116
1117 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1118 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1119
1120 if (args->num_to_skip) {
1121 --args->num_to_skip;
1122 return NV_OK;
1123 }
1124
1125 UVM_ASSERT(args->num_written < args->num_to_write);
1126 args->subchunks[args->num_written++] = chunk;
1127
1128 // Bail immediately once we hit our limit. Note that this is not an error:
1129 // we just need to exit the walk.
1130 if (args->num_written == args->num_to_write)
1131 return NV_ERR_OUT_OF_RANGE;
1132
1133 return NV_OK;
1134 }
1135
size_t uvm_pmm_gpu_get_subchunks(uvm_pmm_gpu_t *pmm,
1137 uvm_gpu_chunk_t *parent,
1138 size_t start_index,
1139 size_t num_subchunks,
1140 uvm_gpu_chunk_t **subchunks)
1141 {
1142 NV_STATUS status;
1143
1144 get_subchunks_walk_t walk_args =
1145 {
1146 .num_written = 0,
1147 .num_to_write = num_subchunks,
1148 .num_to_skip = start_index,
1149 .subchunks = subchunks,
1150 };
1151
1152 if (num_subchunks == 0)
1153 return 0;
1154
1155 UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1156 parent->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1157 parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
1158
1159 uvm_mutex_lock(&pmm->lock);
1160
1161 // Either pre- or post-order would work. Pick post-order just because we
1162 // only care about leaf chunks and we may exit early, so we'd get slightly
1163 // fewer callbacks.
1164 status = chunk_walk_post_order(pmm, parent, get_subchunks_walk_func, &walk_args);
1165 if (status != NV_OK) {
1166 UVM_ASSERT(status == NV_ERR_OUT_OF_RANGE);
1167 UVM_ASSERT(walk_args.num_written == walk_args.num_to_write);
1168 }
1169
1170 uvm_mutex_unlock(&pmm->lock);
1171 return walk_args.num_written;
1172 }
1173
static uvm_gpu_chunk_t *list_first_chunk(struct list_head *list)
1175 {
1176 return list_first_entry_or_null(list, uvm_gpu_chunk_t, list);
1177 }
1178
void uvm_pmm_gpu_merge_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1180 {
1181 uvm_mutex_lock(&pmm->lock);
1182 uvm_pmm_gpu_merge_chunk_locked(pmm, chunk);
1183 uvm_mutex_unlock(&pmm->lock);
1184 }
1185
static void root_chunk_unmap_indirect_peer(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_gpu_t *other_gpu)
1187 {
1188 uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1189 size_t index = root_chunk_index(pmm, root_chunk);
1190 long long new_count;
1191 NV_STATUS status;
1192
1193 indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
1194
1195 uvm_assert_root_chunk_locked(pmm, root_chunk);
1196 UVM_ASSERT(indirect_peer->dma_addrs);
1197 UVM_ASSERT(root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
1198 UVM_ASSERT(uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id));
1199
1200 // The tracker could have work which requires the indirect peer mappings to
1201 // remain until finished, such as PTE unmaps of this chunk from indirect
1202 // peers, so we need to wait. We also need to wait on the entire tracker,
1203 // not just other_gpu's entries, because there might be implicit chained
1204 // dependencies in the tracker.
1205 //
1206 // We know there can't be any other work which requires these mappings:
1207 // - If we're freeing the root chunk back to PMA or switching types of the
1208 // root chunk, nothing else can reference the chunk.
1209 //
1210 // - If the chunk is still allocated then global peer access must be in the
1211 // process of being disabled, say because one of the GPUs is being
1212 // unregistered. We know that all VA spaces must have already called
1213 // disable_peers and have waited on those PTE unmaps. The chunk could be
1214 // freed concurrently with this indirect peer unmap, but that will be
1215 // serialized by the root chunk lock.
1216 status = uvm_tracker_wait(&root_chunk->tracker);
1217 if (status != NV_OK)
1218 UVM_ASSERT(uvm_global_get_status() != NV_OK);
1219
1220 uvm_parent_gpu_unmap_cpu_pages(other_gpu->parent, indirect_peer->dma_addrs[index], UVM_CHUNK_SIZE_MAX);
1221 uvm_processor_mask_clear(&root_chunk->indirect_peers_mapped, other_gpu->id);
1222 new_count = atomic64_dec_return(&indirect_peer->map_count);
1223 UVM_ASSERT(new_count >= 0);
1224 }
1225
static void root_chunk_unmap_indirect_peers(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
1227 {
1228 uvm_gpu_id_t other_gpu_id;
1229
1230 // Root chunks should use a global processor mask as they are not bound to
1231 // a specific VA space. However, indirect peers are not supported when SMC
1232 // partitioning is enabled and, therefore, we can obtain the uvm_gpu_t
1233 // object directly from the uvm_parent_gpu_t object's id.
1234 for_each_gpu_id_in_mask(other_gpu_id, &root_chunk->indirect_peers_mapped) {
1235 uvm_gpu_t *other_gpu = uvm_gpu_get(other_gpu_id);
1236 root_chunk_unmap_indirect_peer(pmm, root_chunk, other_gpu);
1237 }
1238 }
1239
NV_STATUS uvm_pmm_gpu_indirect_peer_init(uvm_pmm_gpu_t *pmm, uvm_gpu_t *accessing_gpu)
1241 {
1242 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1243 NvU64 *dma_addrs;
1244 uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1245 NV_STATUS status = NV_OK;
1246
1247 indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
1248
1249 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1250 UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
1251 UVM_ASSERT(!indirect_peer->dma_addrs);
1252 UVM_ASSERT(atomic64_read(&indirect_peer->map_count) == 0);
1253
1254 // Each root chunk tracks whether it has a mapping to a given indirect peer,
1255 // so we don't need to initialize this array.
1256 dma_addrs = uvm_kvmalloc(pmm->root_chunks.count * sizeof(dma_addrs[0]));
1257 if (!dma_addrs)
1258 status = NV_ERR_NO_MEMORY;
1259 else
1260 indirect_peer->dma_addrs = dma_addrs;
1261
1262 return status;
1263 }
1264
static bool check_indirect_peer_empty(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu)
1266 {
1267 uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1268 size_t i;
1269
1270 indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
1271
1272 for (i = 0; i < pmm->root_chunks.count; i++) {
1273 uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
1274
1275 // This doesn't take the root chunk lock because checking the mask is an
1276 // atomic operation.
1277 if (uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id)) {
1278 UVM_ASSERT(atomic64_read(&indirect_peer->map_count) > 0);
1279 return false;
1280 }
1281 }
1282
1283 UVM_ASSERT(atomic64_read(&indirect_peer->map_count) == 0);
1284 return true;
1285 }
1286
void uvm_pmm_gpu_indirect_peer_destroy(uvm_pmm_gpu_t *pmm, uvm_gpu_t *other_gpu)
1288 {
1289 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1290 uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1291 size_t i;
1292
1293 indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(other_gpu->id)];
1294
1295 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
1296 UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, other_gpu));
1297
1298 if (!indirect_peer->dma_addrs) {
1299 UVM_ASSERT(check_indirect_peer_empty(pmm, other_gpu));
1300 return;
1301 }
1302
1303 // Just go over all root chunks and unmap them. This is slow, but it is not
1304 // a frequent operation.
1305 for (i = 0; i < pmm->root_chunks.count && atomic64_read(&indirect_peer->map_count); i++) {
1306 uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
1307
1308 // Take the root chunk lock to prevent chunks from transitioning in or
1309 // out of the PMA_OWNED state, and to serialize updates to the tracker
1310 // and indirect_peers_mapped mask. Note that indirect peers besides
1311 // other_gpu could be trying to create mappings concurrently.
1312 root_chunk_lock(pmm, root_chunk);
1313
1314 if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
1315 UVM_ASSERT(uvm_processor_mask_empty(&root_chunk->indirect_peers_mapped));
1316 else if (uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, other_gpu->id))
1317 root_chunk_unmap_indirect_peer(pmm, root_chunk, other_gpu);
1318
1319 root_chunk_unlock(pmm, root_chunk);
1320 }
1321
1322 UVM_ASSERT(check_indirect_peer_empty(pmm, other_gpu));
1323
1324 uvm_kvfree(indirect_peer->dma_addrs);
1325 indirect_peer->dma_addrs = NULL;
1326 }
1327
NV_STATUS uvm_pmm_gpu_indirect_peer_map(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu)
1329 {
1330 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1331 uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1332 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
1333 size_t index = root_chunk_index(pmm, root_chunk);
1334 NV_STATUS status = NV_OK;
1335
1336 indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
1337
1338 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1339 chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1340
1341 UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
1342 UVM_ASSERT(indirect_peer->dma_addrs);
1343
1344 // Serialize:
1345 // - Concurrent mappings to this root chunk (same or different GPUs)
1346 // - Concurrent unmappings of this root chunk (must be a different GPU)
1347 root_chunk_lock(pmm, root_chunk);
1348
1349 if (!uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id)) {
1350 status = uvm_parent_gpu_map_cpu_pages(accessing_gpu->parent,
1351 uvm_gpu_chunk_to_page(pmm, &root_chunk->chunk),
1352 UVM_CHUNK_SIZE_MAX,
1353 &indirect_peer->dma_addrs[index]);
1354 if (status == NV_OK) {
1355 uvm_processor_mask_set(&root_chunk->indirect_peers_mapped, accessing_gpu->id);
1356 atomic64_inc(&indirect_peer->map_count);
1357 }
1358 }
1359
1360 root_chunk_unlock(pmm, root_chunk);
1361 return status;
1362 }
1363
NvU64 uvm_pmm_gpu_indirect_peer_addr(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu)
1365 {
1366 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1367 uvm_gpu_root_chunk_indirect_peer_t *indirect_peer;
1368 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
1369 size_t index = root_chunk_index(pmm, root_chunk);
1370 NvU64 chunk_offset = chunk->address - root_chunk->chunk.address;
1371
1372 indirect_peer = &pmm->root_chunks.indirect_peer[uvm_id_gpu_index(accessing_gpu->id)];
1373
1374 UVM_ASSERT(uvm_gpus_are_indirect_peers(gpu, accessing_gpu));
1375 UVM_ASSERT(indirect_peer->dma_addrs);
1376 UVM_ASSERT(uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id));
1377 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1378 chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1379 chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
1380
1381 return indirect_peer->dma_addrs[index] + chunk_offset;
1382 }
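
// Illustrative usage sketch (not an actual call site in this file): a caller
// accessing this GPU's memory from an indirect peer is expected to create the
// sysmem mapping once and then compute per-chunk addresses:
//
//     status = uvm_pmm_gpu_indirect_peer_map(pmm, chunk, accessing_gpu);
//     if (status == NV_OK) {
//         NvU64 dma_addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, accessing_gpu);
//         // dma_addr is a sysmem (DMA) address usable by accessing_gpu
//     }
//
// The mapping is created for the whole 2M root chunk, so mapping any chunk
// also makes the other chunks within the same root chunk addressable.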
1383
1384 uvm_gpu_phys_address_t uvm_pmm_gpu_peer_phys_address(uvm_pmm_gpu_t *pmm,
1385 uvm_gpu_chunk_t *chunk,
1386 uvm_gpu_t *accessing_gpu)
1387 {
1388 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1389 uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
1390 uvm_aperture_t aperture = uvm_gpu_peer_aperture(accessing_gpu, gpu);
1391 NvU64 addr;
1392
1393 if (peer_caps->is_indirect_peer)
1394 addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, accessing_gpu);
1395 else if (uvm_gpus_are_nvswitch_connected(accessing_gpu, gpu))
1396 addr = chunk->address + gpu->parent->nvswitch_info.fabric_memory_window_start;
1397 else
1398 addr = chunk->address;
1399
1400 return uvm_gpu_phys_address(aperture, addr);
1401 }
1402
1403 uvm_gpu_address_t uvm_pmm_gpu_peer_copy_address(uvm_pmm_gpu_t *pmm,
1404 uvm_gpu_chunk_t *chunk,
1405 uvm_gpu_t *accessing_gpu)
1406 {
1407 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1408 uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(accessing_gpu, gpu);
1409 uvm_gpu_identity_mapping_t *gpu_peer_mapping;
1410
1411 if (peer_caps->is_indirect_peer ||
1412 (accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_PHYSICAL)) {
1413 // Indirect peers are accessed as sysmem addresses, and physical-mode peer
1414 // copies use physical addresses directly, so neither case needs an identity mapping.
1415 return uvm_gpu_address_from_phys(uvm_pmm_gpu_peer_phys_address(pmm, chunk, accessing_gpu));
1416 }
1417
1418 UVM_ASSERT(accessing_gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL);
1419 gpu_peer_mapping = uvm_gpu_get_peer_mapping(accessing_gpu, gpu->id);
1420
1421 return uvm_gpu_address_virtual(gpu_peer_mapping->base + chunk->address);
1422 }
1423
1424 static NV_STATUS evict_root_chunk_from_va_block(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_va_block_t *va_block)
1425 {
1426 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1427 NV_STATUS status;
1428 uvm_tracker_t tracker = UVM_TRACKER_INIT();
1429
1430 UVM_ASSERT(va_block);
1431
1432 // To evict the chunks from the VA block we need to lock it, but we already
1433 // have the PMM lock held. Unlock it first and re-lock it after.
1434 uvm_mutex_unlock(&pmm->lock);
1435
1436 uvm_mutex_lock(&va_block->lock);
1437
1438 status = uvm_va_block_evict_chunks(va_block, gpu, &root_chunk->chunk, &tracker);
1439
1440 uvm_mutex_unlock(&va_block->lock);
1441
1442 // The block has been retained by find_and_retain_va_block_to_evict(),
1443 // release it here as it's not needed any more. Notably do that even if
1444 // uvm_va_block_evict_chunks() fails.
1445 uvm_va_block_release(va_block);
1446
1447 if (status == NV_OK) {
1448 root_chunk_lock(pmm, root_chunk);
1449 status = uvm_tracker_add_tracker_safe(&root_chunk->tracker, &tracker);
1450 root_chunk_unlock(pmm, root_chunk);
1451 }
1452
1453 uvm_tracker_deinit(&tracker);
1454
1455 uvm_mutex_lock(&pmm->lock);
1456
1457 return status;
1458 }
1459
1460 void uvm_pmm_gpu_mark_chunk_evicted(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1461 {
1462 uvm_spin_lock(&pmm->list_lock);
1463
1464 UVM_ASSERT(chunk_is_in_eviction(pmm, chunk));
1465 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1466 UVM_ASSERT(chunk->va_block != NULL);
1467
1468 chunk->va_block = NULL;
1469 chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
1470 chunk_pin(pmm, chunk);
1471
1472 uvm_spin_unlock(&pmm->list_lock);
1473 }
1474
1475 static NV_STATUS pin_free_chunks_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1476 {
1477 uvm_assert_mutex_locked(&pmm->lock);
1478
1479 uvm_spin_lock(&pmm->list_lock);
1480
1481 UVM_ASSERT(chunk_is_in_eviction(pmm, chunk));
1482
1483 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE) {
1484 list_del_init(&chunk->list);
1485 chunk_pin(pmm, chunk);
1486 if (chunk->parent)
1487 chunk->parent->suballoc->allocated++;
1488 }
1489
1490 uvm_spin_unlock(&pmm->list_lock);
1491
1492 return NV_OK;
1493 }
1494
1495 static NV_STATUS free_first_pinned_chunk_func(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1496 {
1497 uvm_assert_mutex_locked(&pmm->lock);
1498
1499 UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));
1500
1501 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
1502 free_chunk_with_merges(pmm, chunk);
1503 return NV_ERR_MORE_DATA_AVAILABLE;
1504 }
1505
1506 return NV_OK;
1507 }
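
// Note on the walk callback convention used by the eviction path: returning
// NV_ERR_MORE_DATA_AVAILABLE from a chunk_walk_pre_order() callback stops the
// walk early, and callers that want to process one matching chunk at a time
// just restart the walk, e.g. (sketch):
//
//     do {
//         status = chunk_walk_pre_order(pmm, chunk, free_first_pinned_chunk_func, NULL);
//     } while (status == NV_ERR_MORE_DATA_AVAILABLE);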
1508
1509 typedef struct
1510 {
1511 uvm_va_block_t *va_block_to_evict_from;
1512 } evict_data_t;
1513
1514 static NV_STATUS find_and_retain_va_block_to_evict(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
1515 {
1516 NV_STATUS status = NV_OK;
1517 evict_data_t *evict_data = (evict_data_t *)data;
1518
1519 UVM_ASSERT(evict_data->va_block_to_evict_from == NULL);
1520
1521 uvm_spin_lock(&pmm->list_lock);
1522
1523 // All free chunks should have been pinned already by pin_free_chunks_func().
1524 UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1525 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED ||
1526 chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT,
1527 "state %s\n", uvm_pmm_gpu_chunk_state_string(chunk->state));
1528
1529 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
1530 UVM_ASSERT(chunk->va_block);
1531 evict_data->va_block_to_evict_from = chunk->va_block;
1532 uvm_va_block_retain(chunk->va_block);
1533 status = NV_ERR_MORE_DATA_AVAILABLE;
1534 }
1535
1536 uvm_spin_unlock(&pmm->list_lock);
1537
1538 return status;
1539 }
1540
1541 static bool root_chunk_has_elevated_page(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk)
1542 {
1543 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1544 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
1545 struct page *page;
1546
1547 if (!gpu->mem_info.numa.enabled)
1548 return false;
1549
1550 page = uvm_gpu_chunk_to_page(pmm, chunk);
1551
1552 return page_count(page) > UVM_CHUNK_SIZE_MAX / PAGE_SIZE;
1553 }
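
// For reference, the threshold above is UVM_CHUNK_SIZE_MAX / PAGE_SIZE, which
// works out to 2M / 4K = 512 on a system with a 4K PAGE_SIZE. A page_count()
// above that is taken to mean there is another holder of the page outside of
// UVM (see the Bug 2085760 comments below).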
1554
1555 static NV_STATUS evict_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_pmm_context_t pmm_context)
1556 {
1557 NV_STATUS status;
1558 NV_STATUS free_status;
1559 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
1560 const uvm_pmm_gpu_memory_type_t type = chunk->type;
1561
1562 uvm_assert_mutex_locked(&pmm->lock);
1563
1564 // First pin all the free subchunks
1565 status = chunk_walk_pre_order(pmm, chunk, pin_free_chunks_func, NULL);
1566 UVM_ASSERT(status == NV_OK);
1567 while (1) {
1568 evict_data_t evict = {0};
1569 status = chunk_walk_pre_order(pmm, chunk, find_and_retain_va_block_to_evict, &evict);
1570
1571 // find_and_retain_va_block_to_evict() returns NV_ERR_MORE_DATA_AVAILABLE
1572 // immediately after finding the first VA block to evict from and NV_OK
1573 // if no more blocks are left.
1574 if (status != NV_ERR_MORE_DATA_AVAILABLE) {
1575 UVM_ASSERT(status == NV_OK);
1576 break;
1577 }
1578
1579 // Evict the chunks from the VA block. Notably this will unlock and
1580 // re-lock the PMM mutex. This is ok as we don't rely on any PMM state
1581 // that can change across the calls. In particular, the walk to pick the
1582 // next VA block to evict above is always started from the root chunk.
1583 status = evict_root_chunk_from_va_block(pmm, root_chunk, evict.va_block_to_evict_from);
1584 if (status != NV_OK)
1585 goto error;
1586 }
1587
1588 // All of the leaf chunks should be pinned now, merge them all back into a
1589 // pinned root chunk.
1590 uvm_pmm_gpu_merge_chunk_locked(pmm, chunk);
1591
1592 uvm_spin_lock(&pmm->list_lock);
1593
1594 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1595 uvm_gpu_chunk_set_in_eviction(chunk, false);
1596
1597 chunk->is_zero = false;
1598
1599 uvm_spin_unlock(&pmm->list_lock);
1600
1601 // Bug 2085760: Check if there is any page within the evicted chunk with an
1602 // elevated refcount. In such a case there is another holder of the page,
1603 // which prevents us from reusing it. This can happen on systems where
1604 // struct pages backed by GPU memory are directly available to third-party
1605 // device drivers. Note that at this point the chunk is not on any free
1606 // list, so we can just free it back to PMA, which will handle the page
1607 // with the elevated refcount.
1608 if (root_chunk_has_elevated_page(pmm, root_chunk)) {
1609 free_root_chunk(pmm, root_chunk, free_root_chunk_mode_from_pmm_context(pmm_context));
1610 return NV_ERR_IN_USE;
1611 }
1612
1613 UVM_ASSERT(check_chunk(pmm, chunk));
1614
1615 return NV_OK;
1616
1617 error:
1618 // On error we need to free all the chunks that we were able to evict so
1619 // far. They should all be pinned.
1620
1621 // Clear the eviction state so any new chunks freed by other threads are
1622 // actually freed instead of pinned. We need the list lock to make the
1623 // eviction check and conditional pin in chunk_free_locked atomic with our
1624 // free-if-pinned loop below.
1625 uvm_spin_lock(&pmm->list_lock);
1626
1627 uvm_gpu_chunk_set_in_eviction(chunk, false);
1628
1629 // In case we didn't manage to evict any chunks and hence the root is still
1630 // unpinned, we need to put it back on an eviction list.
1631 // chunk_update_lists_locked() will do that.
1632 chunk_update_lists_locked(pmm, chunk);
1633
1634 uvm_spin_unlock(&pmm->list_lock);
1635
1636 do {
1637 free_status = chunk_walk_pre_order(pmm, chunk, free_first_pinned_chunk_func, NULL);
1638 } while (free_status == NV_ERR_MORE_DATA_AVAILABLE);
1639 UVM_ASSERT(free_status == NV_OK);
1640
1641 (void)free_next_available_root_chunk(pmm, type);
1642
1643 return status;
1644 }
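
// In outline, evict_root_chunk() proceeds in three phases (sketch):
//
//     pin all currently free subchunks              // pin_free_chunks_func()
//     while an allocated subchunk remains:
//         retain its VA block and evict from it     // find_and_retain_va_block_to_evict(),
//                                                   // evict_root_chunk_from_va_block()
//     merge everything back into one pinned 2M root // uvm_pmm_gpu_merge_chunk_locked()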
1645
1646 static bool chunk_is_evictable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1647 {
1648 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
1649
1650 uvm_assert_spinlock_locked(&pmm->list_lock);
1651
1652 if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
1653 return false;
1654
1655 if (chunk_is_root_chunk_pinned(pmm, chunk))
1656 return false;
1657
1658 if (chunk_is_in_eviction(pmm, chunk))
1659 return false;
1660
1661 // An evictable chunk's root should be on one of the eviction lists.
1662 UVM_ASSERT(!list_empty(&root_chunk->chunk.list));
1663
1664 return true;
1665 }
1666
1667 static void chunk_start_eviction(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1668 {
1669 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
1670 chunk = &root_chunk->chunk;
1671
1672 uvm_assert_spinlock_locked(&pmm->list_lock);
1673
1674 UVM_ASSERT(chunk_is_evictable(pmm, chunk));
1675 UVM_ASSERT(!list_empty(&chunk->list));
1676
1677 list_del_init(&chunk->list);
1678 uvm_gpu_chunk_set_in_eviction(chunk, true);
1679 }
1680
1681 static void root_chunk_update_eviction_list(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, struct list_head *list)
1682 {
1683 uvm_spin_lock(&pmm->list_lock);
1684
1685 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
1686 UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
1687 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
1688 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1689
1690 if (!chunk_is_root_chunk_pinned(pmm, chunk) && !chunk_is_in_eviction(pmm, chunk)) {
1691 // An unpinned chunk not selected for eviction should be on one of the
1692 // eviction lists.
1693 UVM_ASSERT(!list_empty(&chunk->list));
1694
1695 list_move_tail(&chunk->list, list);
1696 }
1697
1698 uvm_spin_unlock(&pmm->list_lock);
1699 }
1700
1701 void uvm_pmm_gpu_mark_root_chunk_used(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1702 {
1703 root_chunk_update_eviction_list(pmm, chunk, &pmm->root_chunks.va_block_used);
1704 }
1705
1706 void uvm_pmm_gpu_mark_root_chunk_unused(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
1707 {
1708 root_chunk_update_eviction_list(pmm, chunk, &pmm->root_chunks.va_block_unused);
1709 }
1710
1711 static uvm_gpu_root_chunk_t *pick_root_chunk_to_evict(uvm_pmm_gpu_t *pmm)
1712 {
1713 uvm_gpu_chunk_t *chunk;
1714
1715 uvm_spin_lock(&pmm->list_lock);
1716
1717 // Check if there are root chunks sitting in the free lists. Non-zero
1718 // chunks are preferred.
1719 chunk = list_first_chunk(find_free_list(pmm,
1720 UVM_PMM_GPU_MEMORY_TYPE_USER,
1721 UVM_CHUNK_SIZE_MAX,
1722 UVM_PMM_LIST_NO_ZERO));
1723 if (chunk)
1724 UVM_ASSERT(!chunk->is_zero);
1725
1726 if (!chunk) {
1727 chunk = list_first_chunk(find_free_list(pmm,
1728 UVM_PMM_GPU_MEMORY_TYPE_USER,
1729 UVM_CHUNK_SIZE_MAX,
1730 UVM_PMM_LIST_ZERO));
1731 if (chunk)
1732 UVM_ASSERT(chunk->is_zero);
1733 }
1734
1735 if (!chunk)
1736 chunk = list_first_chunk(&pmm->root_chunks.va_block_unused);
1737
1738 // TODO: Bug 1765193: Move the chunks to the tail of the used list whenever
1739 // they get mapped.
1740 if (!chunk)
1741 chunk = list_first_chunk(&pmm->root_chunks.va_block_used);
1742
1743 if (chunk)
1744 chunk_start_eviction(pmm, chunk);
1745
1746 uvm_spin_unlock(&pmm->list_lock);
1747
1748 if (chunk)
1749 return root_chunk_from_chunk(pmm, chunk);
1750 return NULL;
1751 }
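
// To summarize the selection order above, candidates are picked from cheapest
// to most expensive to reclaim:
//  1) free non-zero 2M root chunks
//  2) free zero 2M root chunks
//  3) root chunks on the va_block_unused list
//  4) root chunks on the va_block_used list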
1752
1753 static NV_STATUS pick_and_evict_root_chunk(uvm_pmm_gpu_t *pmm,
1754 uvm_pmm_gpu_memory_type_t type,
1755 uvm_pmm_context_t pmm_context,
1756 uvm_gpu_chunk_t **out_chunk)
1757 {
1758 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1759 NV_STATUS status;
1760 uvm_gpu_chunk_t *chunk;
1761 uvm_gpu_root_chunk_t *root_chunk;
1762
1763 UVM_ASSERT(uvm_parent_gpu_supports_eviction(gpu->parent));
1764
1765 uvm_assert_mutex_locked(&pmm->lock);
1766
1767 root_chunk = pick_root_chunk_to_evict(pmm);
1768 if (!root_chunk)
1769 return NV_ERR_NO_MEMORY;
1770
1771 status = evict_root_chunk(pmm, root_chunk, pmm_context);
1772 if (status != NV_OK)
1773 return status;
1774
1775 chunk = &root_chunk->chunk;
1776
1777 if (uvm_pmm_gpu_memory_type_is_kernel(type)) {
1778 NvU32 flags = 0;
1779 if (pmm_context == PMM_CONTEXT_PMA_EVICTION)
1780 flags |= UVM_PMA_CALLED_FROM_PMA_EVICTION;
1781
1782 // Transitioning user memory type to kernel memory type requires pinning
1783 // it so that PMA doesn't pick it for eviction.
1784 status = nvUvmInterfacePmaPinPages(pmm->pma,
1785 &chunk->address,
1786 1,
1787 UVM_CHUNK_SIZE_MAX,
1788 flags);
1789 if (status == NV_ERR_IN_USE) {
1790 // Pinning can fail if some of the pages have been chosen for
1791 // eviction already. In that case free the root chunk back to PMA
1792 // and let the caller retry.
1793 free_root_chunk(pmm, root_chunk, free_root_chunk_mode_from_pmm_context(pmm_context));
1794
1795 return status;
1796 }
1797
1798 UVM_ASSERT_MSG(status == NV_OK,
1799 "pmaPinPages(root_chunk=0x%llx) failed unexpectedly: %s\n",
1800 chunk->address,
1801 nvstatusToString(status));
1802
1803 // Unmap any indirect peer physical mappings for this chunk, since
1804 // kernel chunks generally don't need them.
1805 root_chunk_lock(pmm, root_chunk);
1806 root_chunk_unmap_indirect_peers(pmm, root_chunk);
1807 root_chunk_unlock(pmm, root_chunk);
1808
1809 uvm_spin_lock(&pmm->list_lock);
1810 chunk->type = type;
1811 uvm_spin_unlock(&pmm->list_lock);
1812 }
1813
1814 *out_chunk = chunk;
1815 return NV_OK;
1816 }
1817
1818 static NV_STATUS pick_and_evict_root_chunk_retry(uvm_pmm_gpu_t *pmm,
1819 uvm_pmm_gpu_memory_type_t type,
1820 uvm_pmm_context_t pmm_context,
1821 uvm_gpu_chunk_t **out_chunk)
1822 {
1823 NV_STATUS status;
1824
1825 // Eviction can fail if the chunk gets selected for PMA eviction at
1826 // the same time. Keep retrying.
1827 do {
1828 status = pick_and_evict_root_chunk(pmm, type, pmm_context, out_chunk);
1829 } while (status == NV_ERR_IN_USE);
1830
1831 return status;
1832 }
1833
1834 static uvm_gpu_chunk_t *find_free_chunk_locked(uvm_pmm_gpu_t *pmm,
1835 uvm_pmm_gpu_memory_type_t type,
1836 uvm_chunk_size_t chunk_size,
1837 uvm_pmm_list_zero_t zero_type)
1838 {
1839 struct list_head *free_list = find_free_list(pmm, type, chunk_size, zero_type);
1840 uvm_gpu_chunk_t *tmp, *chunk;
1841
1842 uvm_assert_spinlock_locked(&pmm->list_lock);
1843
1844 list_for_each_entry_safe(chunk, tmp, free_list, list) {
1845 if (zero_type == UVM_PMM_LIST_ZERO)
1846 UVM_ASSERT(chunk->is_zero);
1847 else
1848 UVM_ASSERT(!chunk->is_zero);
1849
1850 if (chunk_is_in_eviction(pmm, chunk)) {
1851 // Remove chunks that have been picked for eviction from the free
1852 // lists. The eviction path does it with pin_free_chunks_func(),
1853 // but there is a window between when a root chunk is chosen for
1854 // eviction and all of its subchunks are removed from free lists.
1855 list_del_init(&chunk->list);
1856 }
1857 else {
1858 // Bug 2085760: When NUMA GPU is enabled, also check that the root
1859 // chunk containing the candidate free chunk doesn't have any page
1860 // escaped to another driver. If that is the case, just skip such
1861 // chunk hoping that the page will eventually lose the extra
1862 // reference.
1863 // References can only be added when a virtual mapping to the page
1864 // exists, so once a chunk in the free list has no elevated pages
1865 // the chunk is safe to reuse.
1866 if (!root_chunk_has_elevated_page(pmm, root_chunk_from_chunk(pmm, chunk)))
1867 return chunk;
1868 }
1869 }
1870
1871 return NULL;
1872 }
1873
1874 static uvm_gpu_chunk_t *claim_free_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_type_t type, uvm_chunk_size_t chunk_size)
1875 {
1876 uvm_gpu_chunk_t *chunk;
1877
1878 uvm_spin_lock(&pmm->list_lock);
1879
1880 // Prefer zero free chunks as they are likely going to be used for a new
1881 // allocation.
1882 //
1883 // TODO: Bug 2446832: Allow callers to request non-zero chunks in PMM
1884 // allocation functions, so we don't waste zero chunks.
1885 chunk = find_free_chunk_locked(pmm, type, chunk_size, UVM_PMM_LIST_ZERO);
1886
1887 if (!chunk)
1888 chunk = find_free_chunk_locked(pmm, type, chunk_size, UVM_PMM_LIST_NO_ZERO);
1889
1890 if (!chunk)
1891 goto out;
1892
1893 UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size, "chunk size %u expected %u\n",
1894 uvm_gpu_chunk_get_size(chunk), chunk_size);
1895 UVM_ASSERT(chunk->type == type);
1896 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
1897 UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));
1898
1899 if (chunk->parent) {
1900 UVM_ASSERT(chunk->parent->suballoc);
1901 UVM_ASSERT(chunk->parent->type == type);
1902 UVM_ASSERT(chunk->parent->suballoc->allocated < num_subchunks(chunk->parent));
1903 chunk->parent->suballoc->allocated++;
1904 }
1905
1906 chunk_pin(pmm, chunk);
1907 chunk_update_lists_locked(pmm, chunk);
1908
1909 out:
1910 uvm_spin_unlock(&pmm->list_lock);
1911
1912 return chunk;
1913 }
1914
1915 static NV_STATUS alloc_or_evict_root_chunk(uvm_pmm_gpu_t *pmm,
1916 uvm_pmm_gpu_memory_type_t type,
1917 uvm_pmm_alloc_flags_t flags,
1918 uvm_gpu_chunk_t **chunk_out)
1919 {
1920 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1921 NV_STATUS status;
1922 uvm_gpu_chunk_t *chunk;
1923
1924 status = alloc_root_chunk(pmm, type, flags, &chunk);
1925 if (status != NV_OK) {
1926 if ((flags & UVM_PMM_ALLOC_FLAGS_EVICT) && uvm_parent_gpu_supports_eviction(gpu->parent))
1927 status = pick_and_evict_root_chunk_retry(pmm, type, PMM_CONTEXT_DEFAULT, chunk_out);
1928
1929 return status;
1930 }
1931
1932 *chunk_out = chunk;
1933 return status;
1934 }
1935
1936 // Same as alloc_or_evict_root_chunk(), but without the PMM lock held.
1937 static NV_STATUS alloc_or_evict_root_chunk_unlocked(uvm_pmm_gpu_t *pmm,
1938 uvm_pmm_gpu_memory_type_t type,
1939 uvm_pmm_alloc_flags_t flags,
1940 uvm_gpu_chunk_t **chunk_out)
1941 {
1942 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
1943 NV_STATUS status;
1944 uvm_gpu_chunk_t *chunk;
1945
1946 status = alloc_root_chunk(pmm, type, flags, &chunk);
1947 if (status != NV_OK) {
1948 if ((flags & UVM_PMM_ALLOC_FLAGS_EVICT) && uvm_parent_gpu_supports_eviction(gpu->parent)) {
1949 uvm_mutex_lock(&pmm->lock);
1950 status = pick_and_evict_root_chunk_retry(pmm, type, PMM_CONTEXT_DEFAULT, chunk_out);
1951 uvm_mutex_unlock(&pmm->lock);
1952 }
1953
1954 return status;
1955 }
1956
1957 *chunk_out = chunk;
1958 return status;
1959 }
1960
1961 static NV_STATUS alloc_chunk_with_splits(uvm_pmm_gpu_t *pmm,
1962 uvm_pmm_gpu_memory_type_t type,
1963 uvm_chunk_size_t chunk_size,
1964 uvm_pmm_alloc_flags_t flags,
1965 uvm_gpu_chunk_t **out_chunk)
1966 {
1967 NV_STATUS status;
1968 uvm_chunk_size_t cur_size;
1969 uvm_gpu_chunk_t *chunk;
1970 uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[type];
1971
1972 uvm_assert_mutex_locked(&pmm->lock);
1973 UVM_ASSERT(chunk_size != UVM_CHUNK_SIZE_MAX);
1974
1975 // Check for a free chunk again in case a different thread freed something
1976 // up while this thread was waiting for the PMM lock.
1977 chunk = claim_free_chunk(pmm, type, chunk_size);
1978 if (chunk) {
1979 // A free chunk was claimed, return immediately.
1980 UVM_ASSERT(check_chunk(pmm, chunk));
1981
1982 *out_chunk = chunk;
1983 return NV_OK;
1984 }
1985
1986 cur_size = chunk_size;
1987
1988 // Look for a bigger free chunk that can be split
1989 for_each_chunk_size_from(cur_size, chunk_sizes) {
1990 chunk = claim_free_chunk(pmm, type, cur_size);
1991 if (chunk)
1992 break;
1993 }
1994
1995 if (unlikely(!chunk)) {
1996 status = alloc_or_evict_root_chunk(pmm, type, flags, &chunk);
1997 if (status != NV_OK)
1998 return status;
1999 cur_size = UVM_CHUNK_SIZE_MAX;
2000 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == cur_size);
2001 }
2002
2003 UVM_ASSERT(chunk);
2004
2005 for_each_chunk_size_rev_from(cur_size, chunk_sizes) {
2006 NvU32 i;
2007 uvm_gpu_chunk_t *parent;
2008
2009 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == cur_size);
2010 UVM_ASSERT(chunk->type == type);
2011 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2012
2013 if (chunk->parent) {
2014 UVM_ASSERT(chunk->parent->suballoc);
2015 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk->parent) == uvm_chunk_find_next_size(chunk_sizes, cur_size));
2016 UVM_ASSERT(chunk->parent->type == type);
2017 UVM_ASSERT_MSG(chunk->parent->suballoc->allocated <= num_subchunks(chunk->parent), "allocated %u num %u\n",
2018 chunk->parent->suballoc->allocated, num_subchunks(chunk->parent));
2019 }
2020
2021 if (cur_size == chunk_size) {
2022 *out_chunk = chunk;
2023 return NV_OK;
2024 }
2025
2026 status = split_gpu_chunk(pmm, chunk);
2027 if (status != NV_OK) {
2028 free_chunk_with_merges(pmm, chunk);
2029 return status;
2030 }
2031
2032 parent = chunk;
2033
2034 // Use the first subchunk for further splitting, if needed.
2035 chunk = parent->suballoc->subchunks[0];
2036
2037 // And add the rest to the free list
2038 uvm_spin_lock(&pmm->list_lock);
2039
2040 for (i = 1; i < num_subchunks(parent); ++i)
2041 chunk_free_locked(pmm, parent->suballoc->subchunks[i]);
2042
2043 uvm_spin_unlock(&pmm->list_lock);
2044 }
2045 UVM_PANIC();
2046 }
2047
2048 // Allocates a single chunk of a given size. If needed, splits a chunk of
2049 // bigger size or, if that is not possible, allocates from PMA or evicts.
2050 NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
2051 uvm_pmm_gpu_memory_type_t type,
2052 uvm_chunk_size_t chunk_size,
2053 uvm_pmm_alloc_flags_t flags,
2054 uvm_gpu_chunk_t **out_chunk)
2055 {
2056 NV_STATUS status;
2057 uvm_gpu_chunk_t *chunk;
2058
2059 chunk = claim_free_chunk(pmm, type, chunk_size);
2060 if (chunk) {
2061 // A free chunk could be claimed, we are done.
2062 goto out;
2063 }
2064
2065 if (chunk_size == UVM_CHUNK_SIZE_MAX) {
2066 // For chunks of root chunk size we won't be doing any splitting so we
2067 // can just directly try allocating without holding the PMM lock. If
2068 // eviction is necessary, the lock will be acquired internally.
2069 status = alloc_or_evict_root_chunk_unlocked(pmm, type, flags, &chunk);
2070 if (status != NV_OK)
2071 return status;
2072
2073 goto out;
2074 }
2075
2076 // We didn't find a free chunk and we will require splits so acquire the
2077 // PMM lock.
2078 uvm_mutex_lock(&pmm->lock);
2079
2080 status = alloc_chunk_with_splits(pmm, type, chunk_size, flags, &chunk);
2081
2082 uvm_mutex_unlock(&pmm->lock);
2083
2084 if (status != NV_OK) {
2085 (void)free_next_available_root_chunk(pmm, type);
2086 return status;
2087 }
2088
2089 out:
2090 *out_chunk = chunk;
2091
2092 return NV_OK;
2093 }
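
// Illustrative (hypothetical) caller sketch, assuming UVM_CHUNK_SIZE_64K is one
// of the supported user chunk sizes and eviction is allowed:
//
//     uvm_gpu_chunk_t *chunk;
//     NV_STATUS status = alloc_chunk(pmm,
//                                    UVM_PMM_GPU_MEMORY_TYPE_USER,
//                                    UVM_CHUNK_SIZE_64K,
//                                    UVM_PMM_ALLOC_FLAGS_EVICT,
//                                    &chunk);
//     if (status == NV_OK)
//         free_chunk(pmm, chunk);
//
// The fast path is claim_free_chunk(); splits, PMA allocation and eviction are
// only attempted when no free chunk of the requested size is available.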
2094
2095 // Initialize the given root chunk. If the initial state is
2096 // UVM_PMM_GPU_CHUNK_STATE_FREE, the chunk is added to the corresponding free
2097 // list.
2098 //
2099 // PMA lock must be held by the caller
2100 static void init_root_chunk(uvm_pmm_gpu_t *pmm,
2101 uvm_pmm_gpu_memory_type_t type,
2102 uvm_gpu_root_chunk_t *root_chunk,
2103 uvm_pmm_gpu_chunk_state_t initial_state,
2104 bool is_zero)
2105 {
2106 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
2107 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
2108
2109 uvm_assert_rwsem_locked(&pmm->pma_lock);
2110
2111 root_chunk_lock(pmm, root_chunk);
2112
2113 uvm_tracker_init(&root_chunk->tracker);
2114
2115 uvm_spin_lock(&pmm->list_lock);
2116
2117 UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED,
2118 "Address 0x%llx state %s GPU %s\n",
2119 chunk->address,
2120 uvm_pmm_gpu_chunk_state_string(chunk->state),
2121 uvm_gpu_name(gpu));
2122
2123 UVM_ASSERT(chunk->parent == NULL);
2124 UVM_ASSERT(chunk->suballoc == NULL);
2125 UVM_ASSERT(chunk->va_block == NULL);
2126 UVM_ASSERT(chunk->va_block_page_index == PAGES_PER_UVM_VA_BLOCK);
2127 UVM_ASSERT(list_empty(&chunk->list));
2128 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
2129 UVM_ASSERT(!root_chunk_has_elevated_page(pmm, root_chunk));
2130
2131 UVM_ASSERT(initial_state == UVM_PMM_GPU_CHUNK_STATE_FREE ||
2132 initial_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2133
2134 chunk->type = type;
2135 chunk->state = initial_state;
2136 chunk->is_zero = is_zero;
2137
2138 chunk_update_lists_locked(pmm, chunk);
2139
2140 uvm_spin_unlock(&pmm->list_lock);
2141
2142 root_chunk_unlock(pmm, root_chunk);
2143 }
2144
2145 NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
2146 uvm_pmm_gpu_memory_type_t type,
2147 uvm_pmm_alloc_flags_t flags,
2148 uvm_gpu_chunk_t **out_chunk)
2149 {
2150 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
2151 NV_STATUS status;
2152 UvmPmaAllocationOptions options = {0};
2153 NvU32 num_chunks;
2154 NvU32 i;
2155 bool used_kmem_cache = false;
2156 UvmGpuPointer pa;
2157 UvmGpuPointer *pas;
2158
2159 // TODO: Bug 2444368: On P9 systems, PMA scrubbing is very slow. For now,
2160 // zero the chunk within UVM. Re-evaluate this condition once PMA scrubbing
2161 // is improved.
2162 //
2163 // TODO: Bug 2446832: Most (all?) kernel chunks don't require scrubbing.
2164 // Also, user pages that are about to be overwritten, don't need to be
2165 // zeroed, either. Add an interface to uvm_pmm_gpu_alloc for callers to
2166 // specify when they don't need zeroed pages.
2167 const bool skip_pma_scrubbing = gpu->mem_info.numa.enabled;
2168 UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(type) || uvm_pmm_gpu_memory_type_is_kernel(type));
2169
2170 options.flags = UVM_PMA_ALLOCATE_DONT_EVICT;
2171
2172 if (uvm_pmm_gpu_memory_type_is_kernel(type) || !gpu_supports_pma_eviction(gpu))
2173 options.flags |= UVM_PMA_ALLOCATE_PINNED;
2174
2175 if (skip_pma_scrubbing)
2176 options.flags |= UVM_PMA_ALLOCATE_NO_ZERO;
2177
2178 // TODO: Bug 200480500: Batching is currently disabled on P9. Re-enable
2179 // when the performance of best-effort allocations is verified.
2180 if (gpu->mem_info.numa.enabled)
2181 flags |= UVM_PMM_ALLOC_FLAGS_DONT_BATCH;
2182
2183 // When the Confidential Computing feature is enabled, allocate GPU memory
2184 // in the protected region, unless specified otherwise.
2185 if (g_uvm_global.conf_computing_enabled && memory_type_is_protected(type))
2186 options.flags |= UVM_PMA_ALLOCATE_PROTECTED_REGION;
2187
2188 if (!gpu->parent->rm_info.isSimulated &&
2189 !(options.flags & UVM_PMA_ALLOCATE_PINNED) &&
2190 !(flags & UVM_PMM_ALLOC_FLAGS_DONT_BATCH)) {
2191 num_chunks = 1 << uvm_perf_pma_batch_nonpinned_order;
2192
2193 // Allocate a batch of root chunks in order to reduce the number of
2194 // calls to PMA. The first one is returned as allocated, the rest are
2195 // added to the corresponding free list.
2196 pas = kmem_cache_alloc(g_pma_address_batch_cache_ref.cache, NV_UVM_GFP_FLAGS);
2197 if (!pas)
2198 return NV_ERR_NO_MEMORY;
2199
2200 // Make the allocation best-effort to avoid retries if the whole batch
2201 // cannot be allocated.
2202 options.flags |= UVM_PMA_ALLOCATE_ALLOW_PARTIAL;
2203
2204 used_kmem_cache = true;
2205 }
2206 else {
2207 num_chunks = 1;
2208
2209 pas = &pa;
2210 }
2211
2212 // Acquire the PMA lock for read so that uvm_pmm_gpu_pma_evict_range() can
2213 // flush out any pending allocs.
2214 uvm_down_read(&pmm->pma_lock);
2215
2216 status = nvUvmInterfacePmaAllocPages(pmm->pma, num_chunks, UVM_CHUNK_SIZE_MAX, &options, pas);
2217 if (status != NV_OK)
2218 goto exit_unlock;
2219
2220 // Batched allocations are best-effort. Therefore, we need to adjust the
2221 // number of allocated chunks.
2222 if (used_kmem_cache) {
2223 UVM_ASSERT(options.numPagesAllocated <= num_chunks);
2224 UVM_ASSERT(options.numPagesAllocated > 0);
2225 num_chunks = options.numPagesAllocated;
2226 }
2227
2228 for (i = 0; i < num_chunks; ++i) {
2229 uvm_pmm_gpu_chunk_state_t initial_state;
2230 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_address(pmm, pas[i]);
2231 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
2232
2233 if (i == 0) {
2234 initial_state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
2235 *out_chunk = chunk;
2236 }
2237 else {
2238 initial_state = UVM_PMM_GPU_CHUNK_STATE_FREE;
2239 }
2240
2241 UVM_ASSERT_MSG(IS_ALIGNED(pas[i], UVM_CHUNK_SIZE_MAX), "Address 0x%llx\n", pas[i]);
2242 UVM_ASSERT(chunk->address == pas[i]);
2243
2244 init_root_chunk(pmm,
2245 type,
2246 root_chunk,
2247 initial_state,
2248 !!(options.resultFlags & UVM_PMA_ALLOCATE_RESULT_IS_ZERO));
2249 }
2250
2251 exit_unlock:
2252 uvm_up_read(&pmm->pma_lock);
2253
2254 if (used_kmem_cache)
2255 kmem_cache_free(g_pma_address_batch_cache_ref.cache, pas);
2256
2257 return status;
2258 }
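
// Batching example: with uvm_perf_pma_batch_nonpinned_order set to 6 (an
// illustrative value), a single nvUvmInterfacePmaAllocPages() call requests
// 1 << 6 = 64 root chunks, i.e. 128MB worth of 2M chunks. The first chunk is
// returned to the caller and the remaining ones (possibly fewer than 63, since
// the batch is best-effort) go straight onto the free lists.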
2259
2260 void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_root_chunk_mode_t free_mode)
2261 {
2262 NV_STATUS status;
2263 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
2264 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
2265 NvU32 flags = 0;
2266
2267 // Acquire the PMA lock for read so that uvm_pmm_gpu_pma_evict_range() can
2268 // flush out any pending frees.
2269 uvm_down_read(&pmm->pma_lock);
2270
2271 root_chunk_lock(pmm, root_chunk);
2272
2273 root_chunk_unmap_indirect_peers(pmm, root_chunk);
2274
2275 status = uvm_tracker_wait_deinit(&root_chunk->tracker);
2276 if (status != NV_OK) {
2277 // TODO: Bug 1766184: Handle RC/ECC. For now just go ahead and free the chunk anyway.
2278 UVM_ASSERT(uvm_global_get_status() != NV_OK);
2279 }
2280
2281 uvm_spin_lock(&pmm->list_lock);
2282
2283 UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED,
2284 "Address 0x%llx state %s GPU %s\n",
2285 chunk->address,
2286 uvm_pmm_gpu_chunk_state_string(chunk->state),
2287 uvm_gpu_name(gpu));
2288 UVM_ASSERT(list_empty(&chunk->list));
2289
2290 chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
2291
2292 uvm_spin_unlock(&pmm->list_lock);
2293
2294 root_chunk_unlock(pmm, root_chunk);
2295
2296 if (free_mode == FREE_ROOT_CHUNK_MODE_SKIP_PMA_FREE) {
2297 uvm_up_read(&pmm->pma_lock);
2298 return;
2299 }
2300
2301 if (free_mode == FREE_ROOT_CHUNK_MODE_PMA_EVICTION)
2302 flags |= UVM_PMA_CALLED_FROM_PMA_EVICTION;
2303
2304 if (chunk->is_zero)
2305 flags |= UVM_PMA_FREE_IS_ZERO;
2306
2307 nvUvmInterfacePmaFreePages(pmm->pma, &chunk->address, 1, UVM_CHUNK_SIZE_MAX, flags);
2308
2309 uvm_up_read(&pmm->pma_lock);
2310 }
2311
2312 // Splits the input chunk into subchunks of the next size down. The chunk state
2313 // can be UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED or
2314 // UVM_PMM_GPU_CHUNK_STATE_ALLOCATED.
2315 //
2316 // UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED: This is a split for allocation.
2317 //
2318 // UVM_PMM_GPU_CHUNK_STATE_ALLOCATED: This is an in-place split. The new chunks
2319 // are also marked allocated and they inherit the reverse map from the original.
2320 //
2321 // The PMM lock must be held when calling this function.
2322 NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2323 {
2324 uvm_chunk_size_t chunk_size = uvm_gpu_chunk_get_size(chunk);
2325 uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[chunk->type];
2326 uvm_chunk_size_t subchunk_size;
2327 size_t cache_idx, num_sub;
2328 int i;
2329 NV_STATUS status;
2330 uvm_pmm_gpu_chunk_suballoc_t *suballoc;
2331 uvm_gpu_chunk_t *subchunk;
2332 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
2333
2334 uvm_assert_mutex_locked(&pmm->lock);
2335 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
2336 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2337
2338 subchunk_size = uvm_chunk_find_prev_size(chunk_sizes, chunk_size);
2339 UVM_ASSERT(subchunk_size != UVM_CHUNK_SIZE_INVALID);
2340
2341 num_sub = chunk_size / subchunk_size;
2342 cache_idx = ilog2(num_sub);
2343 UVM_ASSERT(chunk_split_cache[cache_idx].cache != NULL);
2344
2345 suballoc = nv_kmem_cache_zalloc(chunk_split_cache[cache_idx].cache, NV_UVM_GFP_FLAGS);
2346 if (suballoc == NULL)
2347 return NV_ERR_NO_MEMORY;
2348
2349 for (i = 0; i < num_sub; i++) {
2350 // If requested, inject a failure on the last subchunk
2351 if (unlikely(chunk->inject_split_error) && i == num_sub - 1) {
2352 status = NV_ERR_NO_MEMORY;
2353 goto cleanup;
2354 }
2355
2356 subchunk = nv_kmem_cache_zalloc(CHUNK_CACHE, NV_UVM_GFP_FLAGS);
2357 if (!subchunk) {
2358 status = NV_ERR_NO_MEMORY;
2359 goto cleanup;
2360 }
2361 suballoc->subchunks[i] = subchunk;
2362
2363 subchunk->gpu_index = chunk->gpu_index;
2364 subchunk->address = chunk->address + i * subchunk_size;
2365 subchunk->type = chunk->type;
2366 uvm_gpu_chunk_set_size(subchunk, subchunk_size);
2367 subchunk->parent = chunk;
2368 subchunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
2369 subchunk->is_zero = chunk->is_zero;
2370 INIT_LIST_HEAD(&subchunk->list);
2371
2372 // The child inherits the parent's state.
2373 subchunk->state = chunk->state;
2374
2375 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
2376 UVM_ASSERT(chunk->va_block);
2377 uvm_assert_mutex_locked(&chunk->va_block->lock);
2378 subchunk->va_block = chunk->va_block;
2379 subchunk->va_block_page_index = chunk->va_block_page_index + (i * subchunk_size) / PAGE_SIZE;
2380 subchunk->is_referenced = chunk->is_referenced;
2381 }
2382 }
2383
2384 // We're splitting an allocated or pinned chunk in-place.
2385 suballoc->allocated = num_sub;
2386
2387 // Now that all of the subchunk state has been initialized, transition the
2388 // parent into the split state under the list lock.
2389 uvm_spin_lock(&pmm->list_lock);
2390
2391 chunk->suballoc = suballoc;
2392
2393 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
2394 chunk->va_block = NULL;
2395 chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
2396 chunk->is_referenced = false;
2397 }
2398 else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
2399 // -1 for the parent chunk that is going to transition into the split state.
2400 root_chunk->chunk.suballoc->pinned_leaf_chunks += num_sub - 1;
2401
2402 // When a pinned root chunk gets split, the count starts at 0 and doesn't
2403 // account for the root chunk itself, so add the 1 back.
2404 if (chunk_is_root_chunk(chunk))
2405 root_chunk->chunk.suballoc->pinned_leaf_chunks += 1;
2406 }
2407
2408 chunk->state = UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT;
2409
2410 uvm_spin_unlock(&pmm->list_lock);
2411
2412 return NV_OK;
2413 cleanup:
2414 for (i = 0; i < num_sub; i++) {
2415 if (suballoc->subchunks[i] == NULL)
2416 break;
2417 kmem_cache_free(CHUNK_CACHE, suballoc->subchunks[i]);
2418 }
2419 kmem_cache_free(chunk_split_cache[cache_idx].cache, suballoc);
2420 return status;
2421 }
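
// Split example, assuming the common 4K/64K/2M chunk size set: splitting a 2M
// chunk yields num_sub = 2M / 64K = 32 subchunks with cache_idx = ilog2(32) = 5,
// and splitting a 64K chunk yields 64K / 4K = 16 subchunks with cache_idx = 4.
// Each split descends a single level, so going from 2M to 4K takes two splits.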
2422
2423 // Sanity check the chunk, the chunk's tree, and any mappings to the chunk. The
2424 // chunk must be newly-freed or newly-allocated, but its state may not reflect
2425 // that yet.
2426 //
2427 // This function always returns true so it can be called from an assert macro.
2428 static bool check_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2429 {
2430 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
2431 uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[chunk->type];
2432 uvm_gpu_chunk_t *parent = chunk->parent;
2433 uvm_chunk_size_t chunk_size = uvm_gpu_chunk_get_size(chunk);
2434 uvm_chunk_size_t parent_size;
2435
2436 UVM_ASSERT(chunk_size & chunk_sizes);
2437 UVM_ASSERT(IS_ALIGNED(chunk->address, chunk_size));
2438 UVM_ASSERT(uvm_id_equal(uvm_gpu_id_from_index(chunk->gpu_index), gpu->id));
2439
2440
2441 // See pmm_squash_memory_type().
2442 if (!g_uvm_global.conf_computing_enabled)
2443 UVM_ASSERT((chunk->type == UVM_PMM_GPU_MEMORY_TYPE_USER) || (chunk->type == UVM_PMM_GPU_MEMORY_TYPE_KERNEL));
2444
2445 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
2446 UVM_ASSERT(chunk_size > uvm_chunk_find_first_size(chunk_sizes));
2447
2448 if (parent) {
2449 UVM_ASSERT(parent->type == chunk->type);
2450
2451 parent_size = uvm_gpu_chunk_get_size(parent);
2452 UVM_ASSERT(uvm_chunk_find_next_size(chunk_sizes, chunk_size) == parent_size);
2453 UVM_ASSERT(parent_size <= uvm_chunk_find_last_size(chunk_sizes));
2454
2455 UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
2456 UVM_ASSERT(parent->suballoc);
2457 UVM_ASSERT(parent->suballoc->allocated > 0);
2458 UVM_ASSERT(parent->suballoc->allocated <= num_subchunks(parent));
2459
2460 UVM_ASSERT(parent->address <= chunk->address);
2461 UVM_ASSERT(chunk->address < parent->address + parent_size);
2462 }
2463 else {
2464 UVM_ASSERT(chunk_size == uvm_chunk_find_last_size(chunk_sizes));
2465 }
2466
2467 if (uvm_pmm_sysmem_mappings_indirect_supported()) {
2468 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
2469 uvm_gpu_id_t other_gpu_id;
2470
2471 root_chunk_lock(pmm, root_chunk);
2472
2473 // See root_chunk_unmap_indirect_peers for the usage of uvm_gpu_get
2474 for_each_gpu_id_in_mask(other_gpu_id, &root_chunk->indirect_peers_mapped) {
2475 uvm_gpu_t *other_gpu = uvm_gpu_get(other_gpu_id);
2476 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(pmm, chunk, other_gpu);
2477 uvm_reverse_map_t reverse_map;
2478 size_t num_mappings;
2479
2480 num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&other_gpu->pmm_reverse_sysmem_mappings,
2481 peer_addr,
2482 uvm_gpu_chunk_get_size(chunk),
2483 &reverse_map,
2484 1);
2485 UVM_ASSERT(num_mappings == 0);
2486 }
2487
2488 root_chunk_unlock(pmm, root_chunk);
2489 }
2490
2491 return true;
2492 }
2493
2494 static bool chunk_is_last_allocated_child(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2495 {
2496 uvm_assert_spinlock_locked(&pmm->list_lock);
2497
2498 if (!chunk->parent)
2499 return false;
2500
2501 return chunk->parent->suballoc->allocated == 1;
2502 }
2503
2504 static void chunk_free_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2505 {
2506 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
2507
2508 uvm_assert_spinlock_locked(&pmm->list_lock);
2509
2510 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
2511 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2512
2513 if (root_chunk->chunk.in_eviction) {
2514 // A root chunk with pinned subchunks would never be picked for eviction
2515 // so this one has to be in the allocated state. Pin it and let the
2516 // evicting thread pick it up.
2517 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
2518 UVM_ASSERT(chunk->va_block != NULL);
2519 UVM_ASSERT(chunk->va_block_page_index != PAGES_PER_UVM_VA_BLOCK);
2520 UVM_ASSERT(list_empty(&chunk->list));
2521 chunk->va_block = NULL;
2522 chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
2523 chunk->is_zero = false;
2524 chunk_pin(pmm, chunk);
2525 return;
2526 }
2527
2528 if (chunk->parent) {
2529 UVM_ASSERT(chunk->parent->suballoc->allocated > 0);
2530 --chunk->parent->suballoc->allocated;
2531 if (chunk->parent->suballoc->allocated == 0) {
2532 // Freeing the last subchunk should trigger a merge and the PMM
2533 // mutex is required to perform it.
2534 uvm_assert_mutex_locked(&pmm->lock);
2535 }
2536 }
2537
2538 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
2539 chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_FREE);
2540 }
2541 else {
2542 chunk->state = UVM_PMM_GPU_CHUNK_STATE_FREE;
2543 chunk->va_block = NULL;
2544 }
2545
2546 chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
2547 chunk->is_zero = false;
2548
2549 chunk_update_lists_locked(pmm, chunk);
2550 }
2551
2552 static bool try_chunk_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2553 {
2554 bool freed = false;
2555
2556 uvm_spin_lock(&pmm->list_lock);
2557
2558 UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED || !chunk->is_referenced);
2559
2560 chunk->inject_split_error = false;
2561
2562 // Chunks that are the last allocated child need to trigger a merge and are
2563 // handled by free_or_prepare_for_merge().
2564 if (!chunk_is_last_allocated_child(pmm, chunk)) {
2565 chunk_free_locked(pmm, chunk);
2566 freed = true;
2567 }
2568
2569 uvm_spin_unlock(&pmm->list_lock);
2570
2571 return freed;
2572 }
2573
2574 // Return NULL if the chunk could be freed immediately. Otherwise, if the chunk
2575 // was the last allocated child, return the parent chunk to be merged with all
2576 // of its children taken off the free list in TEMP_PINNED state.
2577 static uvm_gpu_chunk_t *free_or_prepare_for_merge(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2578 {
2579 uvm_gpu_chunk_t *parent = NULL;
2580 NvU32 i;
2581
2582 uvm_assert_mutex_locked(&pmm->lock);
2583
2584 if (!chunk->parent) {
2585 bool freed = try_chunk_free(pmm, chunk);
2586
2587 // Freeing a root chunk should never fail
2588 UVM_ASSERT(freed);
2589
2590 return NULL;
2591 }
2592
2593 uvm_spin_lock(&pmm->list_lock);
2594
2595 if (chunk_is_last_allocated_child(pmm, chunk))
2596 parent = chunk->parent;
2597
2598 chunk_free_locked(pmm, chunk);
2599
2600 if (parent == NULL) {
2601 UVM_ASSERT(chunk->parent->suballoc->allocated != 0);
2602 goto done;
2603 }
2604
2605 UVM_ASSERT(chunk->parent->suballoc->allocated == 0);
2606
2607 // Pin all the subchunks to prepare them for being merged.
2608 for (i = 0; i < num_subchunks(chunk->parent); ++i) {
2609 uvm_gpu_chunk_t *subchunk = chunk->parent->suballoc->subchunks[i];
2610
2611 UVM_ASSERT(subchunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
2612
2613 list_del_init(&subchunk->list);
2614 subchunk->state = UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
2615 }
2616 root_chunk_from_chunk(pmm, chunk)->chunk.suballoc->pinned_leaf_chunks += num_subchunks(chunk->parent);
2617
2618 chunk->parent->suballoc->allocated = num_subchunks(chunk->parent);
2619 parent = chunk->parent;
2620
2621 done:
2622 uvm_spin_unlock(&pmm->list_lock);
2623
2624 return parent;
2625 }
2626
2627 static void free_chunk_with_merges(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2628 {
2629 uvm_assert_mutex_locked(&pmm->lock);
2630
2631 while (1) {
2632 // When called from the free_chunk path this check_chunk is redundant,
2633 // but we have some PMM-internal direct calls of this function.
2634 UVM_ASSERT(check_chunk(pmm, chunk));
2635
2636 chunk = free_or_prepare_for_merge(pmm, chunk);
2637 if (!chunk)
2638 break;
2639
2640 merge_gpu_chunk(pmm, chunk);
2641 }
2642 }
2643
2644 // Mark the chunk as free and put it on the free list. If this is a suballocated
2645 // chunk and the parent has no more allocated chunks, the parent is freed and so
2646 // on up the tree.
2647 static void free_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2648 {
2649 bool try_free = true;
2650 const bool is_root = chunk_is_root_chunk(chunk);
2651 const uvm_pmm_gpu_memory_type_t type = chunk->type;
2652
2653 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
2654 chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
2655
2656 UVM_ASSERT(check_chunk(pmm, chunk));
2657
2658 if (try_chunk_free(pmm, chunk)) {
2659 try_free = is_root;
2660 }
2661 else {
2662 // Freeing a chunk can only fail if it requires merging. Take the PMM lock
2663 // and free it with merges supported.
2664 uvm_mutex_lock(&pmm->lock);
2665 free_chunk_with_merges(pmm, chunk);
2666 uvm_mutex_unlock(&pmm->lock);
2667 }
2668
2669 // Once try_chunk_free succeeds or free_chunk_with_merges returns, it's no
2670 // longer safe to access chunk in general. All you know is that the
2671 // chunk you freed was put on the free list by the call. Since the spin lock
2672 // has been dropped, any other thread could have come in and allocated the
2673 // chunk in the meantime. Therefore, this next step just looks for a
2674 // root chunk to free, without assuming that one is actually there.
2675
2676 if (try_free)
2677 (void)free_next_available_root_chunk(pmm, type);
2678 }
2679
2680 // Finds and frees the next root chunk of the given type (if any) that can be
2681 // freed. Returns true if a root chunk was freed, or false otherwise.
2682 bool free_next_available_root_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_type_t type)
2683 {
2684 uvm_gpu_chunk_t *result;
2685
2686 UVM_ASSERT(uvm_chunk_find_last_size(pmm->chunk_sizes[type]) == UVM_CHUNK_SIZE_MAX);
2687
2688 uvm_spin_lock(&pmm->list_lock);
2689
2690 // Prefer non-zero free chunk as memory is about to be released to PMA
2691 result = list_first_chunk(find_free_list(pmm, type, UVM_CHUNK_SIZE_MAX, UVM_PMM_LIST_NO_ZERO));
2692 if (result)
2693 UVM_ASSERT(!result->is_zero);
2694
2695 if (!result) {
2696 result = list_first_chunk(find_free_list(pmm, type, UVM_CHUNK_SIZE_MAX, UVM_PMM_LIST_ZERO));
2697 if (result)
2698 UVM_ASSERT(result->is_zero);
2699 }
2700
2701 if (result != NULL) {
2702 list_del_init(&result->list);
2703 UVM_ASSERT(result->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
2704 UVM_ASSERT(uvm_gpu_chunk_get_size(result) == UVM_CHUNK_SIZE_MAX);
2705 UVM_ASSERT(result->type == type);
2706
2707 // The chunk has been freed and removed from the free list so it
2708 // can't get allocated again, but it could be targeted for eviction
2709 // by physical address. Pin it temporarily to protect the chunk from
2710 // eviction between dropping the list lock and taking the root chunk
2711 // lock.
2712 chunk_pin(pmm, result);
2713 }
2714
2715 uvm_spin_unlock(&pmm->list_lock);
2716
2717 if (result != NULL) {
2718 free_root_chunk(pmm, root_chunk_from_chunk(pmm, result), FREE_ROOT_CHUNK_MODE_DEFAULT);
2719 return true;
2720 }
2721
2722 return false;
2723 }
2724
2725 // Get free list for the given chunk size and type
2726 struct list_head *find_free_list(uvm_pmm_gpu_t *pmm,
2727 uvm_pmm_gpu_memory_type_t type,
2728 uvm_chunk_size_t chunk_size,
2729 uvm_pmm_list_zero_t zero_type)
2730 {
2731 uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[type];
2732 size_t idx = hweight_long(chunk_sizes & (chunk_size - 1));
2733 UVM_ASSERT(is_power_of_2(chunk_size));
2734 UVM_ASSERT_MSG(chunk_size & chunk_sizes, "chunk size 0x%x chunk sizes 0x%x\n", chunk_size, chunk_sizes);
2735 return &pmm->free_list[type][idx][zero_type];
2736 }
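
// Index example: with chunk_sizes = 4K | 64K | 2M (an illustrative size set),
// the index is simply the number of supported sizes smaller than chunk_size:
//     4K  -> hweight(chunk_sizes & (4K  - 1)) = 0
//     64K -> hweight(chunk_sizes & (64K - 1)) = 1
//     2M  -> hweight(chunk_sizes & (2M  - 1)) = 2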
2737
2738 struct list_head *find_free_list_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
2739 {
2740 return find_free_list(pmm,
2741 chunk->type,
2742 uvm_gpu_chunk_get_size(chunk),
2743 chunk->is_zero? UVM_PMM_LIST_ZERO : UVM_PMM_LIST_NO_ZERO);
2744 }
2745
2746 static bool uvm_pmm_should_inject_pma_eviction_error(uvm_pmm_gpu_t *pmm)
2747 {
2748 uvm_assert_mutex_locked(&pmm->lock);
2749
2750 if (unlikely(pmm->inject_pma_evict_error_after_num_chunks > 0))
2751 return --pmm->inject_pma_evict_error_after_num_chunks == 0;
2752
2753 return false;
2754 }
2755
2756 // See the documentation of pmaEvictPagesCb_t in pma.h for details of the
2757 // expected semantics.
2758 static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
2759 NvU32 page_size,
2760 NvU64 *pages,
2761 NvU32 num_pages_to_evict,
2762 NvU64 phys_start,
2763 NvU64 phys_end,
2764 UVM_PMA_GPU_MEMORY_TYPE mem_type)
2765 {
2766 NV_STATUS status;
2767 uvm_pmm_gpu_t *pmm = (uvm_pmm_gpu_t *)void_pmm;
2768 uvm_gpu_chunk_t *chunk;
2769 NvU64 num_pages_evicted_so_far = 0;
2770 NvU64 num_pages_left_to_evict = num_pages_to_evict;
2771 const NvU64 pages_per_chunk = UVM_CHUNK_SIZE_MAX / page_size;
2772 bool all_pages_are_zero = true;
2773
2774 UVM_ASSERT(IS_ALIGNED(UVM_CHUNK_SIZE_MAX, page_size));
2775 UVM_ASSERT(UVM_CHUNK_SIZE_MAX >= page_size);
2776
2777 // Currently, when the Confidential Computing feature is enabled, the
2778 // entirety of vidmem is protected.
2779 if (g_uvm_global.conf_computing_enabled && (mem_type != UVM_PMA_GPU_MEMORY_TYPE_PROTECTED))
2780 return NV_ERR_INVALID_ARGUMENT;
2781
2782 while (num_pages_left_to_evict > 0) {
2783 uvm_gpu_root_chunk_t *root_chunk;
2784 uvm_page_index_t page_index;
2785 NvU64 pages_this_time = min(pages_per_chunk, num_pages_left_to_evict);
2786
2787 uvm_mutex_lock(&pmm->lock);
2788
2789 if (uvm_pmm_should_inject_pma_eviction_error(pmm)) {
2790 status = NV_ERR_NO_MEMORY;
2791 }
2792 else {
2793 status = pick_and_evict_root_chunk_retry(pmm,
2794 UVM_PMM_GPU_MEMORY_TYPE_KERNEL,
2795 PMM_CONTEXT_PMA_EVICTION,
2796 &chunk);
2797 }
2798 uvm_mutex_unlock(&pmm->lock);
2799
2800 // TODO: Bug 1795559: Consider waiting for any pinned user allocations
2801 // to be unpinned.
2802 if (status != NV_OK)
2803 goto error;
2804
2805 root_chunk = root_chunk_from_chunk(pmm, chunk);
2806
2807 if (chunk->address < phys_start || chunk->address + UVM_CHUNK_SIZE_MAX > phys_end) {
2808 // If the chunk we get is outside of the physical range requested,
2809 // just give up and return an error.
2810 //
2811 // TODO: Bug 1795559: PMA pre-populates the array of pages with a
2812 // list of candidates that were unpinned before triggering eviction.
2813 // If they were marked for eviction, we could fall back to evicting
2814 // those instead and be sure that it succeeds.
2815 free_root_chunk(pmm, root_chunk, FREE_ROOT_CHUNK_MODE_PMA_EVICTION);
2816 status = NV_ERR_NO_MEMORY;
2817 goto error;
2818 }
2819
2820 all_pages_are_zero = all_pages_are_zero && chunk->is_zero;
2821
2822 // Free the root chunk as far as PMM's state is concerned, but skip the
2823 // free back to PMA as that would make it available for other PMA
2824 // allocations.
2825 free_root_chunk(pmm, root_chunk, FREE_ROOT_CHUNK_MODE_SKIP_PMA_FREE);
2826
2827 for (page_index = 0; page_index < pages_this_time; page_index++)
2828 pages[num_pages_evicted_so_far++] = chunk->address + page_index * page_size;
2829
2830 num_pages_left_to_evict -= pages_this_time;
2831
2832 // If we didn't use a whole root chunk, free its tail back to PMA
2833 // directly.
2834 if (pages_this_time != pages_per_chunk) {
2835 NvU64 address = chunk->address + pages_this_time * page_size;
2836 NvU64 num_pages = pages_per_chunk - pages_this_time;
2837 NvU32 free_flags = UVM_PMA_CALLED_FROM_PMA_EVICTION | UVM_PMA_ALLOCATE_CONTIGUOUS;
2838
2839 if (chunk->is_zero)
2840 free_flags |= UVM_PMA_FREE_IS_ZERO;
2841
2842 // Free the whole tail as a contiguous allocation
2843 nvUvmInterfacePmaFreePages(pmm->pma, &address, num_pages, page_size, free_flags);
2844 }
2845 }
2846
2847 return NV_OK;
2848
2849 error:
2850 // On error, free all of the evicted pages back to PMA directly.
2851 if (num_pages_evicted_so_far > 0) {
2852 NvU32 free_flags = UVM_PMA_CALLED_FROM_PMA_EVICTION;
2853
2854 if (all_pages_are_zero)
2855 free_flags |= UVM_PMA_FREE_IS_ZERO;
2856
2857 nvUvmInterfacePmaFreePages(pmm->pma, pages, num_pages_evicted_so_far, page_size, free_flags);
2858 }
2859
2860 return status;
2861 }
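
// Worked example for the loop above: with page_size = 64K, pages_per_chunk is
// 2M / 64K = 32. A request to evict 40 pages evicts two root chunks: the first
// contributes all 32 pages, the second contributes 8, and its remaining
// 24-page tail is immediately freed back to PMA as one contiguous range.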
2862
2863 static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
2864 NvU32 page_size,
2865 NvU64 *pages,
2866 NvU32 num_pages_to_evict,
2867 NvU64 phys_start,
2868 NvU64 phys_end,
2869 UVM_PMA_GPU_MEMORY_TYPE mem_type)
2870 {
2871 NV_STATUS status;
2872
2873 // RM invokes the eviction callbacks with its API lock held, but not its GPU
2874 // lock.
2875 uvm_record_lock_rm_api();
2876 status = uvm_pmm_gpu_pma_evict_pages(void_pmm, page_size, pages, num_pages_to_evict, phys_start, phys_end, mem_type);
2877 uvm_record_unlock_rm_api();
2878 return status;
2879 }
2880
2881 static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper_entry(void *void_pmm,
2882 NvU64 page_size,
2883 NvU64 *pages,
2884 NvU32 num_pages_to_evict,
2885 NvU64 phys_start,
2886 NvU64 phys_end,
2887 UVM_PMA_GPU_MEMORY_TYPE mem_type)
2888 {
2889 UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_pages_wrapper(void_pmm,
2890 page_size,
2891 pages,
2892 num_pages_to_evict,
2893 phys_start,
2894 phys_end,
2895 mem_type));
2896 }
2897
2898 // See the documentation of pmaEvictRangeCb_t in pma.h for details of the
2899 // expected semantics.
2900 static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm,
2901 NvU64 phys_begin,
2902 NvU64 phys_end,
2903 UVM_PMA_GPU_MEMORY_TYPE mem_type)
2904 {
2905 NV_STATUS status;
2906 uvm_pmm_gpu_t *pmm = (uvm_pmm_gpu_t *)void_pmm;
2907 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
2908 NvU64 address = UVM_ALIGN_DOWN(phys_begin, UVM_CHUNK_SIZE_MAX);
2909
2910 UVM_ASSERT_MSG(phys_begin <= phys_end, "range [0x%llx, 0x%llx]\n", phys_begin, phys_end);
2911 UVM_ASSERT_MSG(phys_end <= gpu->mem_info.max_allocatable_address,
2912 "range [0x%llx, 0x%llx]\n",
2913 phys_begin,
2914 phys_end);
2915
2916     // Make sure that all pending allocations that could have started before
2917     // the eviction callback was called are done. This is required to guarantee
2918     // that any address PMA thinks is owned by UVM has indeed been recorded in
2919     // PMM's state. Taking the pma_lock in write mode will make sure all
2920     // readers (pending allocations and frees) are done, but will also
2921     // unnecessarily stop new allocations from starting until it's released.
2922     // TODO: Bug 1795559: SRCU would likely be better for this type of
2923     // synchronization, but it's GPL-only. Figure out whether we can easily do
2924     // anything better.
2925 uvm_down_write(&pmm->pma_lock);
2926 uvm_up_write(&pmm->pma_lock);
2927
2928 for (; address <= phys_end; address += UVM_CHUNK_SIZE_MAX) {
2929 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_address(pmm, address);
2930 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
2931 bool eviction_started = false;
2932 uvm_spin_loop_t spin;
2933 bool should_inject_error;
2934
2935 uvm_spin_loop_init(&spin);
2936
2937 // Wait until we can start eviction or the chunk is returned to PMA
2938 do {
2939 uvm_spin_lock(&pmm->list_lock);
2940
2941 if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
2942 UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
2943
2944 if (chunk_is_evictable(pmm, chunk)) {
2945 chunk_start_eviction(pmm, chunk);
2946 eviction_started = true;
2947 }
2948 }
2949
2950 uvm_spin_unlock(&pmm->list_lock);
2951
2952 // TODO: Bug 1795559: Replace this with a wait queue.
2953 if (UVM_SPIN_LOOP(&spin) == NV_ERR_TIMEOUT_RETRY) {
2954 UVM_ERR_PRINT("Stuck waiting for root chunk 0x%llx to be unpinned, giving up\n", chunk->address);
2955 return NV_ERR_NO_MEMORY;
2956 }
2957 } while (!eviction_started && chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
2958
2959 // The eviction callback gets called with a physical range that might be
2960 // only partially allocated by UVM. Skip the chunks that UVM doesn't own.
2961 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
2962 continue;
2963
2964 uvm_mutex_lock(&pmm->lock);
2965
2966 status = evict_root_chunk(pmm, root_chunk, PMM_CONTEXT_PMA_EVICTION);
2967 should_inject_error = uvm_pmm_should_inject_pma_eviction_error(pmm);
2968
2969 uvm_mutex_unlock(&pmm->lock);
2970
2971 if (status != NV_OK)
2972 return status;
2973
2974 free_root_chunk(pmm, root_chunk, FREE_ROOT_CHUNK_MODE_PMA_EVICTION);
2975
2976 if (should_inject_error)
2977 return NV_ERR_NO_MEMORY;
2978 }
2979
2980 // Make sure that all pending frees for chunks that the eviction above could
2981 // have observed as PMA owned are done. This is required to guarantee that
2982     // any address PMM thinks is owned by PMA has actually been freed
2983 // back to PMA. Taking the pma_lock in write mode will make sure all
2984 // readers (pending frees) are done, but will also unnecessarily stop new
2985 // allocations and frees from starting until it's released.
2986 uvm_down_write(&pmm->pma_lock);
2987 uvm_up_write(&pmm->pma_lock);
2988
2989 return NV_OK;
2990 }
2991
2992 static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper(void *void_pmm,
2993 NvU64 phys_begin,
2994 NvU64 phys_end,
2995 UVM_PMA_GPU_MEMORY_TYPE mem_type)
2996 {
2997 NV_STATUS status;
2998
2999 // RM invokes the eviction callbacks with its API lock held, but not its GPU
3000 // lock.
3001 uvm_record_lock_rm_api();
3002 status = uvm_pmm_gpu_pma_evict_range(void_pmm, phys_begin, phys_end, mem_type);
3003 uvm_record_unlock_rm_api();
3004 return status;
3005 }
3006
3007 static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper_entry(void *void_pmm,
3008 NvU64 phys_begin,
3009 NvU64 phys_end,
3010 UVM_PMA_GPU_MEMORY_TYPE mem_type)
3011 {
3012 UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_range_wrapper(void_pmm, phys_begin, phys_end, mem_type));
3013 }
3014
3015 static void deinit_chunk_split_cache(uvm_pmm_gpu_t *pmm)
3016 {
3017 unsigned long subchunk_count_log2;
3018
3019 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3020
3021 for_each_set_bit(subchunk_count_log2, pmm->chunk_split_cache_initialized, UVM_PMM_CHUNK_SPLIT_CACHE_SIZES) {
3022 UVM_ASSERT(chunk_split_cache[subchunk_count_log2].refcount > 0);
3023 UVM_ASSERT(chunk_split_cache[subchunk_count_log2].cache);
3024
3025 if (--chunk_split_cache[subchunk_count_log2].refcount == 0)
3026 kmem_cache_destroy_safe(&chunk_split_cache[subchunk_count_log2].cache);
3027
3028 __clear_bit(subchunk_count_log2, pmm->chunk_split_cache_initialized);
3029 }
3030 }
3031
3032 static NV_STATUS init_chunk_split_cache_level(uvm_pmm_gpu_t *pmm, size_t level)
3033 {
3034 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3035
3036 if (!test_bit(level, pmm->chunk_split_cache_initialized)) {
3037 if (!chunk_split_cache[level].cache) {
3038 size_t size;
3039 size_t align;
3040 if (level == 0) {
3041 strncpy(chunk_split_cache[level].name, "uvm_gpu_chunk_t", sizeof(chunk_split_cache[level].name) - 1);
3042 size = sizeof(uvm_gpu_chunk_t);
3043 align = __alignof__(uvm_gpu_chunk_t);
3044 } else {
3045 snprintf(chunk_split_cache[level].name,
3046 sizeof(chunk_split_cache[level].name),
3047 "uvm_gpu_chunk_%u", (unsigned)level);
3048 size = sizeof(uvm_pmm_gpu_chunk_suballoc_t) + (sizeof(uvm_gpu_chunk_t *) << level);
3049 align = __alignof__(uvm_pmm_gpu_chunk_suballoc_t);
3050 }
3051 chunk_split_cache[level].cache =
3052 nv_kmem_cache_create(chunk_split_cache[level].name, size, align);
3053
3054
3055 if (!chunk_split_cache[level].cache)
3056 return NV_ERR_NO_MEMORY;
3057
3058 UVM_ASSERT(chunk_split_cache[level].refcount == 0);
3059 } else {
3060 UVM_ASSERT(chunk_split_cache[level].refcount > 0);
3061 }
3062
3063 ++chunk_split_cache[level].refcount;
3064 UVM_ASSERT_MSG(chunk_split_cache[level].refcount != 0, "Overflow of refcount\n");
3065
3066 __set_bit(level, pmm->chunk_split_cache_initialized);
3067 }
3068
3069 return NV_OK;
3070 }
3071
3072 // Initializes the split cache for the given GPU.
3073 //
3074 // It walks through all memory splits - in other words, all ratios between
3075 // neighboring pairs of chunk sizes - and allocates a kmem cache for each one,
3076 // unless it is already allocated.
3077 //
3078 // It also bumps the refcount if this GPU did not use such a split yet.
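// For illustration (hypothetical chunk sizes): if a memory type supports 4K,
// 64K and 2M chunks, the neighboring pairs are (4K, 64K) and (64K, 2M), giving
// subchunk counts of 16 and 32, so the caches for levels ilog2(16) = 4 and
// ilog2(32) = 5 are created (or their refcounts bumped). Level 0, which backs
// individual uvm_gpu_chunk_t objects, is always initialized at the end.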
3079 static NV_STATUS init_chunk_split_cache(uvm_pmm_gpu_t *pmm)
3080 {
3081 NV_STATUS status;
3082 uvm_pmm_gpu_memory_type_t type;
3083
3084 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3085
3086 for (type = 0; type < UVM_PMM_GPU_MEMORY_TYPE_COUNT; type++) {
3087 uvm_chunk_size_t prev_size, cur_size;
3088 uvm_chunk_sizes_mask_t chunk_sizes = pmm->chunk_sizes[type];
3089         // Iterate over each pair of neighboring sizes. Note that the same level
3090         // may be visited multiple times; init_chunk_split_cache_level() handles
3091         // that internally.
3092 prev_size = uvm_chunk_find_first_size(chunk_sizes);
3093 cur_size = uvm_chunk_find_next_size(chunk_sizes, prev_size);
3094 for_each_chunk_size_from(cur_size, chunk_sizes) {
3095 size_t subchunk_count = cur_size / prev_size;
3096 size_t level = ilog2(subchunk_count);
3097 status = init_chunk_split_cache_level(pmm, level);
3098 if (status != NV_OK)
3099 return status;
3100
3101 prev_size = cur_size;
3102 }
3103 }
3104
3105 return init_chunk_split_cache_level(pmm, 0);
3106 }
3107
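// Each object in g_pma_address_batch_cache_ref is an array of
// (1 << uvm_perf_pma_batch_nonpinned_order) UvmGpuPointer entries, sized to
// hold one batch of PMA page addresses (see address_batch_size below).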
3108 static NV_STATUS init_pma_address_batch_cache(uvm_pmm_gpu_t *pmm)
3109 {
3110 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3111
3112 if (!g_pma_address_batch_cache_ref.cache) {
3113 const size_t address_batch_size = sizeof(UvmGpuPointer) << uvm_perf_pma_batch_nonpinned_order;
3114
3115 snprintf(g_pma_address_batch_cache_ref.name,
3116 sizeof(g_pma_address_batch_cache_ref.name),
3117 "pma_address_batch");
3118 g_pma_address_batch_cache_ref.cache =
3119 nv_kmem_cache_create(g_pma_address_batch_cache_ref.name,
3120 address_batch_size, __alignof__(UvmGpuPointer));
3121
3122 if (!g_pma_address_batch_cache_ref.cache)
3123 return NV_ERR_NO_MEMORY;
3124
3125 UVM_ASSERT(g_pma_address_batch_cache_ref.refcount == 0);
3126 }
3127 else {
3128 UVM_ASSERT(g_pma_address_batch_cache_ref.refcount > 0);
3129 }
3130
3131 pmm->pma_address_cache_initialized = true;
3132
3133 ++g_pma_address_batch_cache_ref.refcount;
3134 UVM_ASSERT_MSG(g_pma_address_batch_cache_ref.refcount != 0, "Overflow of refcount\n");
3135
3136 return NV_OK;
3137 }
3138
3139 static void deinit_pma_address_batch_cache(uvm_pmm_gpu_t *pmm)
3140 {
3141 if (pmm->pma_address_cache_initialized) {
3142 UVM_ASSERT(g_pma_address_batch_cache_ref.refcount > 0);
3143 UVM_ASSERT(g_pma_address_batch_cache_ref.cache);
3144
3145 if (--g_pma_address_batch_cache_ref.refcount == 0)
3146 kmem_cache_destroy_safe(&g_pma_address_batch_cache_ref.cache);
3147
3148 pmm->pma_address_cache_initialized = false;
3149 }
3150 }
3151
3152 static void deinit_caches(uvm_pmm_gpu_t *pmm)
3153 {
3154 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3155
3156 deinit_pma_address_batch_cache(pmm);
3157 deinit_chunk_split_cache(pmm);
3158 }
3159
3160 static NV_STATUS init_caches(uvm_pmm_gpu_t *pmm)
3161 {
3162 NV_STATUS status;
3163
3164 status = init_pma_address_batch_cache(pmm);
3165 if (status != NV_OK)
3166 goto cleanup;
3167
3168 status = init_chunk_split_cache(pmm);
3169 if (status != NV_OK)
3170 goto cleanup;
3171
3172 return NV_OK;
3173
3174 cleanup:
3175 deinit_caches(pmm);
3176
3177 return status;
3178 }
3179
3180 typedef struct
3181 {
3182 // Start/end of the physical region to be traversed (IN)
3183 NvU64 phys_start;
3184 NvU64 phys_end;
3185
3186     // Pointer to the array of mappings where results are stored (OUT)
3187 uvm_reverse_map_t *mappings;
3188
3189 // Number of entries written to mappings (OUT)
3190 NvU32 num_mappings;
3191 } get_chunk_mappings_data_t;
3192
3193 // Chunk traversal function used for phys-to-virt translation. These are the
3194 // possible return values.
3195 //
3196 // - NV_ERR_OUT_OF_RANGE: no allocated physical chunks were found
3197 // - NV_ERR_MORE_DATA_AVAILABLE: allocated physical chunks were found
3198 // - NV_OK: allocated physical chunks may have been found. Check num_mappings
3199 static NV_STATUS get_chunk_mappings_in_range(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, void *data)
3200 {
3201 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3202 get_chunk_mappings_data_t *get_chunk_mappings_data = (get_chunk_mappings_data_t *)data;
3203 NvU64 chunk_end = chunk->address + uvm_gpu_chunk_get_size(chunk) - 1;
3204
3205 uvm_assert_mutex_locked(&pmm->lock);
3206
3207 // Kernel chunks do not have assigned VA blocks so we can just skip them
3208 if (uvm_pmm_gpu_memory_type_is_kernel(chunk->type))
3209 return NV_WARN_NOTHING_TO_DO;
3210
3211 // This chunk is located before the requested physical range. Skip its
3212 // children and keep going
3213 if (chunk_end < get_chunk_mappings_data->phys_start)
3214 return NV_WARN_NOTHING_TO_DO;
3215
3216 // We are beyond the search phys range. Stop traversing.
3217 if (chunk->address > get_chunk_mappings_data->phys_end) {
3218 if (get_chunk_mappings_data->num_mappings > 0)
3219 return NV_ERR_MORE_DATA_AVAILABLE;
3220 else
3221 return NV_ERR_OUT_OF_RANGE;
3222 }
3223
3224 uvm_spin_lock(&pmm->list_lock);
3225
3226     // Return results for allocated leaf chunks only
3227 if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
3228 uvm_reverse_map_t *reverse_map;
3229
3230 UVM_ASSERT(chunk->va_block);
3231 uvm_va_block_retain(chunk->va_block);
3232
3233 reverse_map = &get_chunk_mappings_data->mappings[get_chunk_mappings_data->num_mappings];
3234
3235 reverse_map->va_block = chunk->va_block;
3236 reverse_map->region = uvm_va_block_region(chunk->va_block_page_index,
3237 chunk->va_block_page_index + uvm_gpu_chunk_get_size(chunk) / PAGE_SIZE);
3238 reverse_map->owner = gpu->id;
3239
3240 // If we land in the middle of a chunk, adjust the offset
3241 if (get_chunk_mappings_data->phys_start > chunk->address) {
3242 NvU64 offset = get_chunk_mappings_data->phys_start - chunk->address;
3243
3244 reverse_map->region.first += offset / PAGE_SIZE;
3245 }
3246
3247 // If the physical range doesn't cover the whole chunk, adjust num_pages
3248 if (get_chunk_mappings_data->phys_end < chunk_end)
3249 reverse_map->region.outer -= (chunk_end - get_chunk_mappings_data->phys_end) / PAGE_SIZE;
3250
3251 ++get_chunk_mappings_data->num_mappings;
3252 }
3253
3254 uvm_spin_unlock(&pmm->list_lock);
3255
3256 return NV_OK;
3257 }
3258
3259 NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region_size, uvm_reverse_map_t *out_mappings)
3260 {
3261 NvU64 chunk_base_addr = UVM_ALIGN_DOWN(phys_addr, UVM_CHUNK_SIZE_MAX);
3262 NvU64 size_in_chunk = min(UVM_CHUNK_SIZE_MAX - (phys_addr - chunk_base_addr), region_size);
3263 NvU32 num_mappings = 0;
3264
3265 UVM_ASSERT(PAGE_ALIGNED(phys_addr));
3266 UVM_ASSERT(PAGE_ALIGNED(region_size));
3267
3268 uvm_mutex_lock(&pmm->lock);
3269
3270 // Traverse the whole requested region
3271 do {
3272 NV_STATUS status = NV_OK;
3273 uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_address(pmm, phys_addr);
3274 uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
3275 get_chunk_mappings_data_t get_chunk_mappings_data;
3276
3277 get_chunk_mappings_data.phys_start = phys_addr;
3278 get_chunk_mappings_data.phys_end = phys_addr + size_in_chunk - 1;
3279 get_chunk_mappings_data.mappings = out_mappings + num_mappings;
3280 get_chunk_mappings_data.num_mappings = 0;
3281
3282 // Walk the chunks for the current root chunk
3283 status = chunk_walk_pre_order(pmm,
3284 chunk,
3285 get_chunk_mappings_in_range,
3286 &get_chunk_mappings_data);
3287 if (status == NV_ERR_OUT_OF_RANGE)
3288 break;
3289
3290 if (get_chunk_mappings_data.num_mappings > 0) {
3291 UVM_ASSERT(status == NV_OK || status == NV_ERR_MORE_DATA_AVAILABLE);
3292 num_mappings += get_chunk_mappings_data.num_mappings;
3293 }
3294 else {
3295 UVM_ASSERT(status == NV_OK);
3296 }
3297
3298 region_size -= size_in_chunk;
3299 phys_addr += size_in_chunk;
3300 size_in_chunk = min((NvU64)UVM_CHUNK_SIZE_MAX, region_size);
3301 } while (region_size > 0);
3302
3303 uvm_mutex_unlock(&pmm->lock);
3304
3305 return num_mappings;
3306 }
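
// A minimal usage sketch (hypothetical caller; MAX_TRANSLATIONS and the error
// handling are illustrative, not part of this file). Each returned mapping
// holds a va_block retained during the traversal above, so the caller is
// expected to release it when done:
//
//     uvm_reverse_map_t mappings[MAX_TRANSLATIONS];
//     NvU32 i, count;
//
//     count = uvm_pmm_gpu_phys_to_virt(pmm, phys_addr, region_size, mappings);
//     for (i = 0; i < count; i++) {
//         // Consume mappings[i].va_block, mappings[i].region and
//         // mappings[i].owner, then drop the reference taken by the walk.
//         uvm_va_block_release(mappings[i].va_block);
//     }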
3307
3308 #if UVM_IS_CONFIG_HMM()
3309
3310 static uvm_pmm_gpu_t *devmem_page_to_pmm(struct page *page)
3311 {
3312 return container_of(page->pgmap, uvm_pmm_gpu_t, devmem.pagemap);
3313 }
3314
3315 static uvm_gpu_chunk_t *devmem_page_to_chunk_locked(struct page *page)
3316 {
3317 uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3318 NvU64 chunk_addr = ((NvU64)page_to_pfn(page) << PAGE_SHIFT) - pmm->devmem.pagemap.range.start;
3319 size_t index = chunk_addr / UVM_CHUNK_SIZE_MAX;
3320 uvm_gpu_chunk_t *root_chunk;
3321 uvm_gpu_chunk_t *chunk;
3322 uvm_gpu_chunk_t *parent;
3323 uvm_chunk_size_t chunk_size;
3324
3325 UVM_ASSERT(index < pmm->root_chunks.count);
3326 root_chunk = &pmm->root_chunks.array[index].chunk;
3327 UVM_ASSERT(root_chunk->address == UVM_ALIGN_DOWN(chunk_addr, UVM_CHUNK_SIZE_MAX));
3328
3329 // Find the uvm_gpu_chunk_t that corresponds to the device private struct
3330 // page's PFN. The loop is only 0, 1, or 2 iterations.
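// For illustration (hypothetical split): if the root chunk is split into 64K
// subchunks and chunk_addr falls 192K past the root chunk's address, the first
// iteration computes index = 192K / 64K = 3 and descends into subchunks[3]; if
// that subchunk's size matches page_size(page), the loop stops there.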
3331 for (chunk = root_chunk;
3332 uvm_gpu_chunk_get_size(chunk) != page_size(page);
3333 chunk = parent->suballoc->subchunks[index]) {
3334
3335 parent = chunk;
3336 UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
3337 UVM_ASSERT(parent->suballoc);
3338
3339 chunk_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
3340 index = (size_t)uvm_div_pow2_64(chunk_addr - parent->address, chunk_size);
3341 UVM_ASSERT(index < num_subchunks(parent));
3342 }
3343
3344     UVM_ASSERT(chunk->address == chunk_addr);
3345 UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
3346 UVM_ASSERT(chunk->is_referenced);
3347
3348 return chunk;
3349 }
3350
3351 uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page)
3352 {
3353 uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3354 uvm_gpu_chunk_t *chunk;
3355
3356 UVM_ASSERT(is_device_private_page(page));
3357
3358 uvm_spin_lock(&pmm->list_lock);
3359 chunk = devmem_page_to_chunk_locked(page);
3360 uvm_spin_unlock(&pmm->list_lock);
3361
3362 return chunk;
3363 }
3364
3365 uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
3366 {
3367 uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3368 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3369
3370 UVM_ASSERT(is_device_private_page(page));
3371
3372 return gpu->id;
3373 }
3374
3375 // Check that there are no orphan pages. This should only be called as part of
3376 // removing a GPU: after all work is stopped and all va_blocks have been
3377 // destroyed. By now there should be no device-private page references left, as
3378 // there are no va_spaces left on this GPU and orphan pages should have been
3379 // removed by va_space destruction or unregistration from the GPU.
3380 static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
3381 {
3382 size_t i;
3383 bool ret = true;
3384 unsigned long pfn;
3385 struct range range = pmm->devmem.pagemap.range;
3386
3387 if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
3388 return ret;
3389
3390 // Scan all the root chunks looking for subchunks which are still
3391 // referenced.
3392 for (i = 0; i < pmm->root_chunks.count; i++) {
3393 uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
3394
3395 root_chunk_lock(pmm, root_chunk);
3396 if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
3397 ret = false;
3398 root_chunk_unlock(pmm, root_chunk);
3399 }
3400
3401 for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
3402 struct page *page = pfn_to_page(pfn);
3403
3404 if (!is_device_private_page(page)) {
3405 ret = false;
3406 break;
3407 }
3408
3409 if (page_count(page)) {
3410 ret = false;
3411 break;
3412 }
3413 }
3414
3415 return ret;
3416 }
3417
3418 static void devmem_page_free(struct page *page)
3419 {
3420 uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
3421 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3422 uvm_gpu_chunk_t *chunk;
3423
3424 page->zone_device_data = NULL;
3425
3426 // We should be calling free_chunk() except that it acquires a mutex and
3427 // we may be in an interrupt context where we can't do that. Instead,
3428 // do a lazy free. Note that we have to use a "normal" spin lock because
3429 // the UVM context is not available.
3430 spin_lock(&pmm->list_lock.lock);
3431
3432 chunk = devmem_page_to_chunk_locked(page);
3433 UVM_ASSERT(chunk->is_referenced);
3434 chunk->is_referenced = false;
3435 list_add_tail(&chunk->list, &pmm->root_chunks.va_block_lazy_free);
3436
3437 spin_unlock(&pmm->list_lock.lock);
3438
3439 nv_kthread_q_schedule_q_item(&gpu->parent->lazy_free_q,
3440 &pmm->root_chunks.va_block_lazy_free_q_item);
3441 }
3442
3443 // This is called by HMM when the CPU faults on a ZONE_DEVICE private entry.
3444 static vm_fault_t devmem_fault(struct vm_fault *vmf)
3445 {
3446 uvm_va_space_t *va_space = vmf->page->zone_device_data;
3447
3448 if (!va_space)
3449 return VM_FAULT_SIGBUS;
3450
3451 return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
3452 }
3453
3454 static vm_fault_t devmem_fault_entry(struct vm_fault *vmf)
3455 {
3456 UVM_ENTRY_RET(devmem_fault(vmf));
3457 }
3458
3459 static const struct dev_pagemap_ops uvm_pmm_devmem_ops =
3460 {
3461 .page_free = devmem_page_free,
3462 .migrate_to_ram = devmem_fault_entry,
3463 };
3464
3465 static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
3466 {
3467 unsigned long size = pmm->root_chunks.count * UVM_CHUNK_SIZE_MAX;
3468 uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
3469 struct resource *res;
3470 void *ptr;
3471 NV_STATUS status;
3472
3473 if (!uvm_hmm_is_enabled_system_wide()) {
3474 devmem->pagemap.owner = NULL;
3475 return NV_OK;
3476 }
3477
3478 res = request_free_mem_region(&iomem_resource, size, "nvidia-uvm-hmm");
3479 if (IS_ERR(res)) {
3480 UVM_ERR_PRINT("request_free_mem_region() err %ld\n", PTR_ERR(res));
3481 status = errno_to_nv_status(PTR_ERR(res));
3482 goto err;
3483 }
3484
3485 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
3486 devmem->pagemap.range.start = res->start;
3487 devmem->pagemap.range.end = res->end;
3488 devmem->pagemap.nr_range = 1;
3489 devmem->pagemap.ops = &uvm_pmm_devmem_ops;
3490 devmem->pagemap.owner = &g_uvm_global;
3491
3492     // The NUMA node ID doesn't matter for ZONE_DEVICE private pages.
3493 ptr = memremap_pages(&devmem->pagemap, NUMA_NO_NODE);
3494 if (IS_ERR(ptr)) {
3495 UVM_ERR_PRINT("memremap_pages() err %ld\n", PTR_ERR(ptr));
3496 status = errno_to_nv_status(PTR_ERR(ptr));
3497 goto err_release;
3498 }
3499
3500 return NV_OK;
3501
3502 err_release:
3503 release_mem_region(res->start, resource_size(res));
3504 err:
3505 devmem->pagemap.owner = NULL;
3506 return status;
3507 }
3508
3509 static void devmem_deinit(uvm_pmm_gpu_t *pmm)
3510 {
3511 uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
3512
3513 if (!devmem->pagemap.owner)
3514 return;
3515
3516 memunmap_pages(&devmem->pagemap);
3517 release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
3518 }
3519
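// Translates a chunk's physical address into the PFN of its ZONE_DEVICE struct
// page. For example (hypothetical values, 4K pages): with a pagemap range
// starting at 0x100000000 and a chunk at physical address 0x200000, the PFN is
// (0x100000000 + 0x200000) >> 12 = 0x100200.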
3520 unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
3521 {
3522 return (pmm->devmem.pagemap.range.start + chunk->address) >> PAGE_SHIFT;
3523 }
3524
3525 #endif // UVM_IS_CONFIG_HMM()
3526
3527 #if !UVM_IS_CONFIG_HMM()
3528 static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
3529 {
3530 return NV_OK;
3531 }
3532
3533 static void devmem_deinit(uvm_pmm_gpu_t *pmm)
3534 {
3535 }
3536
3537 static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
3538 {
3539 return true;
3540 }
3541 #endif // UVM_IS_CONFIG_HMM()
3542
3543 static void process_lazy_free(uvm_pmm_gpu_t *pmm)
3544 {
3545 uvm_gpu_chunk_t *chunk;
3546
3547 uvm_spin_lock(&pmm->list_lock);
3548
3549     // Note: We can't use list_for_each_entry_safe() because we drop the lock
3550 // in the loop. Instead, just keep removing the first entry until the list
3551 // is empty.
3552 while (!list_empty(&pmm->root_chunks.va_block_lazy_free)) {
3553 chunk = list_first_entry(&pmm->root_chunks.va_block_lazy_free, uvm_gpu_chunk_t, list);
3554 list_del_init(&chunk->list);
3555 uvm_spin_unlock(&pmm->list_lock);
3556
3557 free_chunk(pmm, chunk);
3558
3559 uvm_spin_lock(&pmm->list_lock);
3560 }
3561
3562 uvm_spin_unlock(&pmm->list_lock);
3563 }
3564
3565 static void process_lazy_free_entry(void *args)
3566 {
3567 UVM_ENTRY_VOID(process_lazy_free(args));
3568 }
3569
3570 NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
3571 {
3572 uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
3573 const uvm_chunk_sizes_mask_t chunk_size_init[][UVM_PMM_GPU_MEMORY_TYPE_COUNT] =
3574 {
3575 { gpu->parent->mmu_user_chunk_sizes,
3576 gpu->parent->mmu_user_chunk_sizes,
3577 gpu->parent->mmu_kernel_chunk_sizes,
3578 gpu->parent->mmu_kernel_chunk_sizes },
3579 { 0, 0, uvm_mem_kernel_chunk_sizes(gpu), uvm_mem_kernel_chunk_sizes(gpu)},
3580 };
3581 NV_STATUS status = NV_OK;
3582 size_t i, j, k;
3583
3584     // UVM_CHUNK_SIZE_INVALID is UVM_CHUNK_SIZE_MAX shifted left by 1. This check
3585     // protects UVM_CHUNK_SIZE_INVALID from wrapping around or becoming negative.
3586 BUILD_BUG_ON(UVM_CHUNK_SIZE_MAX >= UVM_CHUNK_SIZE_INVALID);
3587
3588 uvm_assert_mutex_locked(&g_uvm_global.global_lock);
3589
3590 for (i = 0; i < ARRAY_SIZE(pmm->free_list); i++) {
3591 for (j = 0; j < ARRAY_SIZE(pmm->free_list[i]); j++) {
3592 for (k = 0; k < ARRAY_SIZE(pmm->free_list[i][j]); k++)
3593 INIT_LIST_HEAD(&pmm->free_list[i][j][k]);
3594 }
3595 }
3596 INIT_LIST_HEAD(&pmm->root_chunks.va_block_used);
3597 INIT_LIST_HEAD(&pmm->root_chunks.va_block_unused);
3598 INIT_LIST_HEAD(&pmm->root_chunks.va_block_lazy_free);
3599 nv_kthread_q_item_init(&pmm->root_chunks.va_block_lazy_free_q_item, process_lazy_free_entry, pmm);
3600
3601 uvm_mutex_init(&pmm->lock, UVM_LOCK_ORDER_PMM);
3602 uvm_init_rwsem(&pmm->pma_lock, UVM_LOCK_ORDER_PMM_PMA);
3603 uvm_spin_lock_init(&pmm->list_lock, UVM_LOCK_ORDER_LEAF);
3604
3605 pmm->initialized = true;
3606
3607 for (i = 0; i < UVM_PMM_GPU_MEMORY_TYPE_COUNT; i++) {
3608 pmm->chunk_sizes[i] = 0;
3609 // Add the common root chunk size to all memory types
3610 pmm->chunk_sizes[i] |= UVM_CHUNK_SIZE_MAX;
3611 for (j = 0; j < ARRAY_SIZE(chunk_size_init); j++)
3612 pmm->chunk_sizes[i] |= chunk_size_init[j][i];
3613
3614 UVM_ASSERT(pmm->chunk_sizes[i] < UVM_CHUNK_SIZE_INVALID);
3615 UVM_ASSERT_MSG(hweight_long(pmm->chunk_sizes[i]) <= UVM_MAX_CHUNK_SIZES,
3616 "chunk sizes %lu, max chunk sizes %u\n", hweight_long(pmm->chunk_sizes[i]), UVM_MAX_CHUNK_SIZES);
3617 }
3618
3619 status = init_caches(pmm);
3620 if (status != NV_OK)
3621 goto cleanup;
3622
3623     // Assert that the max physical address of the GPU is not unreasonably big for
3624     // creating the flat array of root chunks. 256GB should provide a reasonable
3625     // amount of future-proofing and results in 128K root chunks, which is still
3626     // manageable.
3627 UVM_ASSERT_MSG(gpu->mem_info.max_allocatable_address < UVM_GPU_MAX_PHYS_MEM,
3628 "Max physical address 0x%llx exceeds limit of 0x%llx\n",
3629 gpu->mem_info.max_allocatable_address,
3630 UVM_GPU_MAX_PHYS_MEM);
3631
3632     // Align up the size to have a root chunk for the last part of the FB. PMM
3633     // won't be able to allocate it if it doesn't cover a whole root chunk, but
3634     // it's convenient to have it for uvm_test_pma_alloc_free().
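// For example (hypothetical size), a GPU whose max allocatable address is 16GB
// ends up with UVM_ALIGN_UP(16GB, 2MB) / 2MB = 8192 root chunks.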
3635 pmm->root_chunks.count = UVM_ALIGN_UP(gpu->mem_info.max_allocatable_address, UVM_CHUNK_SIZE_MAX) /
3636 UVM_CHUNK_SIZE_MAX;
3637 pmm->root_chunks.array = uvm_kvmalloc_zero(sizeof(*pmm->root_chunks.array) * pmm->root_chunks.count);
3638 if (!pmm->root_chunks.array) {
3639 status = NV_ERR_NO_MEMORY;
3640 goto cleanup;
3641 }
3642
3643 // Initialize all root chunks to be PMA owned and set their addresses
3644 for (i = 0; i < pmm->root_chunks.count; ++i) {
3645 uvm_gpu_chunk_t *chunk = &pmm->root_chunks.array[i].chunk;
3646
3647 INIT_LIST_HEAD(&chunk->list);
3648 chunk->gpu_index = uvm_id_gpu_index(gpu->id);
3649 chunk->state = UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED;
3650 uvm_gpu_chunk_set_size(chunk, UVM_CHUNK_SIZE_MAX);
3651 chunk->address = i * UVM_CHUNK_SIZE_MAX;
3652 chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
3653 }
3654
3655 status = uvm_bit_locks_init(&pmm->root_chunks.bitlocks, pmm->root_chunks.count, UVM_LOCK_ORDER_PMM_ROOT_CHUNK);
3656 if (status != NV_OK)
3657 goto cleanup;
3658
3659 if (gpu->mem_info.size != 0) {
3660 status = uvm_rm_locked_call(nvUvmInterfaceGetPmaObject(uvm_gpu_device_handle(gpu), &pmm->pma, &pmm->pma_stats));
3661
3662 if (status != NV_OK)
3663 goto cleanup;
3664
3665 if (gpu_supports_pma_eviction(gpu)) {
3666 status = nvUvmInterfacePmaRegisterEvictionCallbacks(pmm->pma,
3667 uvm_pmm_gpu_pma_evict_pages_wrapper_entry,
3668 uvm_pmm_gpu_pma_evict_range_wrapper_entry,
3669 pmm);
3670 if (status != NV_OK)
3671 goto cleanup;
3672 }
3673 }
3674
3675 status = devmem_init(pmm);
3676 if (status != NV_OK)
3677 goto cleanup;
3678
3679 return NV_OK;
3680 cleanup:
3681 uvm_pmm_gpu_deinit(pmm);
3682 return status;
3683 }
3684
3685 // Return to PMA any remaining free root chunks. Currently only USER
3686 // (non-pinned) chunks are pre-allocated, so the KERNEL free list should be
3687 // empty at this point. However, we may want to batch the allocation of pinned
3688 // pages in the future, too.
3689 static void release_free_root_chunks(uvm_pmm_gpu_t *pmm)
3690 {
3691 uvm_pmm_gpu_memory_type_t type;
3692
3693 for (type = 0; type < UVM_PMM_GPU_MEMORY_TYPE_COUNT; ++type) {
3694 uvm_pmm_list_zero_t zero_type;
3695
3696 while (free_next_available_root_chunk(pmm, type))
3697 ;
3698
3699 for (zero_type = 0; zero_type < UVM_PMM_LIST_ZERO_COUNT; ++zero_type)
3700 UVM_ASSERT(list_empty(find_free_list(pmm, type, UVM_CHUNK_SIZE_MAX, zero_type)));
3701 }
3702 }
3703
3704 void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
3705 {
3706 uvm_gpu_t *gpu;
3707 size_t i, j, k;
3708
3709 if (!pmm->initialized)
3710 return;
3711
3712 gpu = uvm_pmm_to_gpu(pmm);
3713
3714 UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
3715 nv_kthread_q_flush(&gpu->parent->lazy_free_q);
3716 UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
3717 release_free_root_chunks(pmm);
3718
3719 if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu))
3720 nvUvmInterfacePmaUnregisterEvictionCallbacks(pmm->pma);
3721
3722 // TODO: Bug 1766184: Handle ECC/RC
3723 for (i = 0; i < ARRAY_SIZE(pmm->free_list); i++) {
3724 for (j = 0; j < ARRAY_SIZE(pmm->free_list[i]); j++) {
3725 for (k = 0; k < ARRAY_SIZE(pmm->free_list[i][j]); ++k) {
3726 UVM_ASSERT_MSG(list_empty(&pmm->free_list[i][j][k]), "i: %s, j: %zu, k: %zu\n",
3727 uvm_pmm_gpu_memory_type_string(i), j, k);
3728 }
3729 }
3730 }
3731
3732 uvm_bit_locks_deinit(&pmm->root_chunks.bitlocks);
3733
3734 for (i = 0; i < ARRAY_SIZE(pmm->root_chunks.indirect_peer); i++) {
3735 UVM_ASSERT(pmm->root_chunks.indirect_peer[i].dma_addrs == NULL);
3736 UVM_ASSERT(atomic64_read(&pmm->root_chunks.indirect_peer[i].map_count) == 0);
3737 }
3738
3739 if (pmm->root_chunks.array) {
3740 // Make sure that all chunks have been returned to PMA
3741 for (i = 0; i < pmm->root_chunks.count; ++i) {
3742 uvm_gpu_chunk_t *chunk = &pmm->root_chunks.array[i].chunk;
3743 UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED,
3744 "index %zu state %s GPU %s\n",
3745 i,
3746 uvm_pmm_gpu_chunk_state_string(chunk->state),
3747 uvm_gpu_name(gpu));
3748 }
3749 }
3750 uvm_kvfree(pmm->root_chunks.array);
3751
3752 deinit_caches(pmm);
3753
3754 devmem_deinit(pmm);
3755
3756 pmm->initialized = false;
3757 }
3758
3759 NV_STATUS uvm_test_evict_chunk(UVM_TEST_EVICT_CHUNK_PARAMS *params, struct file *filp)
3760 {
3761 NV_STATUS status = NV_OK;
3762 uvm_gpu_t *gpu;
3763 uvm_va_space_t *va_space = uvm_va_space_get(filp);
3764 uvm_va_block_t *block = NULL;
3765 uvm_gpu_root_chunk_t *root_chunk = NULL;
3766 uvm_pmm_gpu_t *pmm;
3767 struct mm_struct *mm;
3768
3769 params->chunk_was_evicted = NV_FALSE;
3770 params->evicted_physical_address = 0;
3771 params->chunk_size_backing_virtual = 0;
3772
3773 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
3774 uvm_va_space_down_read(va_space);
3775
3776     gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
3777 if (!gpu || !uvm_parent_gpu_supports_eviction(gpu->parent)) {
3778 uvm_va_space_up_read(va_space);
3779 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
3780 return NV_ERR_INVALID_DEVICE;
3781 }
3782 pmm = &gpu->pmm;
3783
3784 // Retain the GPU before unlocking the VA space so that it sticks around.
3785 uvm_gpu_retain(gpu);
3786
3787 // For virtual mode, look up and retain the block first so that eviction can
3788 // be started without the VA space lock held.
3789 if (params->eviction_mode == UvmTestEvictModeVirtual) {
3790 if (mm)
3791 status = uvm_va_block_find_create(va_space, params->address, NULL, &block);
3792 else
3793 status = uvm_va_block_find_create_managed(va_space, params->address, &block);
3794
3795 if (status != NV_OK) {
3796 uvm_va_space_up_read(va_space);
3797 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
3798 goto out;
3799 }
3800
3801 // Retain the block before unlocking the VA space lock so that we can
3802 // safely access it later.
3803 uvm_va_block_retain(block);
3804 }
3805
3806     // Unlock the VA space to better emulate real eviction, where a VA space lock
3807     // may not be held or may be held for a different VA space.
3808 uvm_va_space_up_read(va_space);
3809 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
3810
3811 if (params->eviction_mode == UvmTestEvictModeVirtual) {
3812 UVM_ASSERT(block);
3813
3814 uvm_mutex_lock(&block->lock);
3815
3816 // As the VA space lock is not held we need to make sure the block
3817 // is still alive.
3818 if (!uvm_va_block_is_dead(block)) {
3819 // The block might have been split in the meantime and may no longer
3820 // cover the address as a result.
3821 if (params->address >= block->start && params->address <= block->end) {
3822 uvm_gpu_chunk_t *chunk = uvm_va_block_lookup_gpu_chunk(block, gpu, params->address);
3823
3824 uvm_spin_lock(&pmm->list_lock);
3825 if (chunk && chunk_is_evictable(pmm, chunk)) {
3826 chunk_start_eviction(pmm, chunk);
3827 root_chunk = root_chunk_from_chunk(pmm, chunk);
3828 params->chunk_size_backing_virtual = uvm_gpu_chunk_get_size(chunk);
3829 }
3830 uvm_spin_unlock(&pmm->list_lock);
3831 }
3832 }
3833 else {
3834 // Consider it an error to free the block before the eviction ioctl
3835 // is done.
3836 status = NV_ERR_INVALID_ADDRESS;
3837 }
3838
3839 uvm_mutex_unlock(&block->lock);
3840 uvm_va_block_release(block);
3841
3842 if (status != NV_OK)
3843 goto out;
3844 }
3845 else if (params->eviction_mode == UvmTestEvictModePhysical) {
3846 uvm_gpu_chunk_t *chunk;
3847 size_t index = params->address / UVM_CHUNK_SIZE_MAX;
3848
3849 if (index >= pmm->root_chunks.count) {
3850 status = NV_ERR_INVALID_ADDRESS;
3851 goto out;
3852 }
3853
3854 root_chunk = &pmm->root_chunks.array[index];
3855 chunk = &root_chunk->chunk;
3856
3857 uvm_spin_lock(&pmm->list_lock);
3858
3859 if (chunk_is_evictable(pmm, chunk))
3860 chunk_start_eviction(pmm, chunk);
3861 else
3862 chunk = NULL;
3863
3864 uvm_spin_unlock(&pmm->list_lock);
3865
3866 if (!chunk)
3867 root_chunk = NULL;
3868 }
3869 else if (params->eviction_mode == UvmTestEvictModeDefault) {
3870 root_chunk = pick_root_chunk_to_evict(pmm);
3871 }
3872 else {
3873 UVM_DBG_PRINT("Invalid eviction mode: 0x%x\n", params->eviction_mode);
3874 status = NV_ERR_INVALID_ARGUMENT;
3875 goto out;
3876 }
3877
3878 if (!root_chunk) {
3879         // Not finding a chunk to evict is not considered an error; the caller
3880         // can inspect chunk_was_evicted to see whether anything was evicted.
3881 goto out;
3882 }
3883
3884 uvm_mutex_lock(&pmm->lock);
3885 status = evict_root_chunk(pmm, root_chunk, PMM_CONTEXT_DEFAULT);
3886 uvm_mutex_unlock(&pmm->lock);
3887
3888 if (status != NV_OK)
3889 goto out;
3890
3891 params->chunk_was_evicted = NV_TRUE;
3892 params->evicted_physical_address = root_chunk->chunk.address;
3893 free_chunk(pmm, &root_chunk->chunk);
3894
3895 out:
3896 uvm_gpu_release(gpu);
3897 return status;
3898 }
3899
3900 static NV_STATUS test_check_pma_allocated_chunks(uvm_pmm_gpu_t *pmm,
3901 UVM_TEST_PMA_ALLOC_FREE_PARAMS *params,
3902 NvU64 *pages)
3903 {
3904 NV_STATUS status = NV_OK;
3905 NvU32 i;
3906
3907 for (i = 0; i < params->num_pages; ++i) {
3908 uvm_gpu_root_chunk_t *root_chunk;
3909 NvU64 address;
3910 if (params->contiguous)
3911 address = pages[0] + ((NvU64)params->page_size) * i;
3912 else
3913 address = pages[i];
3914
3915 root_chunk = root_chunk_from_address(pmm, address);
3916
3917 if (!IS_ALIGNED(address, params->page_size)) {
3918 UVM_TEST_PRINT("Returned unaligned address 0x%llx page size %u\n", address, params->page_size);
3919 status = NV_ERR_INVALID_STATE;
3920 }
3921
3922 // The chunk should still be in the PMA owned state
3923 uvm_spin_lock(&pmm->list_lock);
3924 if (root_chunk->chunk.state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
3925 UVM_TEST_PRINT("Root chunk 0x%llx invalid state: %s, allocated [0x%llx, 0x%llx)\n",
3926 root_chunk->chunk.address,
3927 uvm_pmm_gpu_chunk_state_string(root_chunk->chunk.state),
3928 address, address + params->page_size);
3929 status = NV_ERR_INVALID_STATE;
3930 }
3931 uvm_spin_unlock(&pmm->list_lock);
3932 }
3933 return status;
3934 }
3935
3936 NV_STATUS uvm_test_pma_alloc_free(UVM_TEST_PMA_ALLOC_FREE_PARAMS *params, struct file *filp)
3937 {
3938 NV_STATUS status = NV_OK;
3939 uvm_gpu_t *gpu;
3940 uvm_pmm_gpu_t *pmm;
3941 NvU64 page;
3942 NvU64 *pages = NULL;
3943 NvU32 free_flags;
3944 UvmPmaAllocationOptions options = {0};
3945 uvm_va_space_t *va_space = uvm_va_space_get(filp);
3946
3947     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
3948 if (!gpu)
3949 return NV_ERR_INVALID_DEVICE;
3950
3951 pmm = &gpu->pmm;
3952
3953 options.flags = UVM_PMA_ALLOCATE_PINNED;
3954 if (params->contiguous) {
3955 options.flags |= UVM_PMA_ALLOCATE_CONTIGUOUS;
3956 pages = &page;
3957 }
3958 else {
3959 pages = uvm_kvmalloc(sizeof(*pages) * params->num_pages);
3960 if (!pages) {
3961 status = NV_ERR_NO_MEMORY;
3962 goto out;
3963 }
3964 }
3965 if (params->phys_begin != 0 || params->phys_end != 0) {
3966 options.physBegin = params->phys_begin;
3967 options.physEnd = params->phys_end;
3968 options.flags |= UVM_PMA_ALLOCATE_SPECIFY_ADDRESS_RANGE;
3969 }
3970
3971 status = nvUvmInterfacePmaAllocPages(pmm->pma, params->num_pages, params->page_size, &options, pages);
3972 if (status != NV_OK)
3973 goto out;
3974
3975 status = test_check_pma_allocated_chunks(pmm, params, pages);
3976 if (status != NV_OK) {
3977 UVM_TEST_PRINT("Failed before the nap\n");
3978 goto free;
3979 }
3980
3981 if (params->nap_us_before_free)
3982 usleep_range(params->nap_us_before_free, params->nap_us_before_free + 10);
3983
3984 status = test_check_pma_allocated_chunks(pmm, params, pages);
3985 if (status != NV_OK)
3986 UVM_TEST_PRINT("Failed after the nap\n");
3987
3988 free:
3989 free_flags = options.flags;
3990
3991 if (!!(options.resultFlags & UVM_PMA_ALLOCATE_RESULT_IS_ZERO))
3992 free_flags |= UVM_PMA_FREE_IS_ZERO;
3993
3994 nvUvmInterfacePmaFreePages(gpu->pmm.pma, pages, params->num_pages, params->page_size, free_flags);
3995
3996 out:
3997 if (!params->contiguous)
3998 uvm_kvfree(pages);
3999
4000 uvm_gpu_release(gpu);
4001 return status;
4002 }
4003
4004 NV_STATUS uvm_test_pmm_alloc_free_root(UVM_TEST_PMM_ALLOC_FREE_ROOT_PARAMS *params, struct file *filp)
4005 {
4006 NV_STATUS status = NV_OK;
4007 uvm_gpu_t *gpu;
4008 uvm_pmm_gpu_t *pmm;
4009 uvm_gpu_chunk_t *chunk;
4010 uvm_tracker_t tracker = UVM_TRACKER_INIT();
4011 uvm_va_space_t *va_space = uvm_va_space_get(filp);
4012
4013     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4014 if (!gpu)
4015 return NV_ERR_INVALID_DEVICE;
4016
4017 pmm = &gpu->pmm;
4018
4019 status = uvm_pmm_gpu_alloc_user(pmm,
4020 1,
4021 UVM_CHUNK_SIZE_MAX,
4022 UVM_PMM_ALLOC_FLAGS_EVICT | UVM_PMM_ALLOC_FLAGS_DONT_BATCH,
4023 &chunk,
4024 &tracker);
4025
4026 if (status != NV_OK)
4027 goto out;
4028
4029 if (params->nap_us_before_free)
4030 usleep_range(params->nap_us_before_free, params->nap_us_before_free + 10);
4031
4032 uvm_pmm_gpu_free(pmm, chunk, NULL);
4033 uvm_tracker_deinit(&tracker);
4034
4035 out:
4036 uvm_gpu_release(gpu);
4037 return status;
4038 }
4039
4040 NV_STATUS uvm_test_pmm_inject_pma_evict_error(UVM_TEST_PMM_INJECT_PMA_EVICT_ERROR_PARAMS *params, struct file *filp)
4041 {
4042 uvm_gpu_t *gpu;
4043 uvm_pmm_gpu_t *pmm;
4044 uvm_va_space_t *va_space = uvm_va_space_get(filp);
4045
4046     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4047 if (!gpu)
4048 return NV_ERR_INVALID_DEVICE;
4049
4050 pmm = &gpu->pmm;
4051
4052 uvm_mutex_lock(&pmm->lock);
4053 pmm->inject_pma_evict_error_after_num_chunks = params->error_after_num_chunks;
4054 uvm_mutex_unlock(&pmm->lock);
4055
4056 uvm_gpu_release(gpu);
4057 return NV_OK;
4058 }
4059
4060 NV_STATUS uvm_test_pmm_release_free_root_chunks(UVM_TEST_PMM_RELEASE_FREE_ROOT_CHUNKS_PARAMS *params,
4061 struct file *filp)
4062 {
4063 uvm_gpu_t *gpu;
4064 uvm_va_space_t *va_space = uvm_va_space_get(filp);
4065
4066     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4067 if (!gpu)
4068 return NV_ERR_INVALID_DEVICE;
4069
4070 release_free_root_chunks(&gpu->pmm);
4071
4072 uvm_gpu_release(gpu);
4073 return NV_OK;
4074 }
4075
4076 NV_STATUS uvm_test_pma_get_batch_size(UVM_TEST_PMA_GET_BATCH_SIZE_PARAMS *params, struct file *filp)
4077 {
4078 uvm_gpu_t *gpu;
4079 uvm_va_space_t *va_space = uvm_va_space_get(filp);
4080
4081     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4082 if (!gpu)
4083 return NV_ERR_INVALID_DEVICE;
4084
4085 if (gpu->parent->rm_info.isSimulated)
4086 params->pma_batch_size = UVM_CHUNK_SIZE_MAX;
4087 else
4088 params->pma_batch_size = (1 << uvm_perf_pma_batch_nonpinned_order) * UVM_CHUNK_SIZE_MAX;
4089
4090 uvm_gpu_release(gpu);
4091 return NV_OK;
4092 }
4093
4094 NV_STATUS uvm_test_pmm_query_pma_stats(UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS *params, struct file *filp)
4095 {
4096 uvm_gpu_t *gpu;
4097 uvm_va_space_t *va_space = uvm_va_space_get(filp);
4098
4099     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
4100 if (!gpu)
4101 return NV_ERR_INVALID_DEVICE;
4102
4103 params->pma_stats.numFreePages64k = UVM_READ_ONCE(gpu->pmm.pma_stats->numFreePages64k);
4104 params->pma_stats.numFreePages2m = UVM_READ_ONCE(gpu->pmm.pma_stats->numFreePages2m);
4105
4106 uvm_gpu_release(gpu);
4107 return NV_OK;
4108 }
4109