1 /*******************************************************************************
2 Copyright (c) 2015-2022 NVIDIA Corporation
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to
6 deal in the Software without restriction, including without limitation the
7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 sell copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 DEALINGS IN THE SOFTWARE.
21
22 *******************************************************************************/
23
24 #ifndef __UVM_VA_RANGE_H__
25 #define __UVM_VA_RANGE_H__
26
27 #include "uvm_linux.h"
28 #include "nv-kref.h"
29 #include "uvm_common.h"
30 #include "uvm_perf_module.h"
31 #include "uvm_processors.h"
32 #include "uvm_gpu.h"
33 #include "uvm_lock.h"
34 #include "uvm_va_space.h"
35 #include "uvm_range_tree.h"
36 #include "uvm_va_policy.h"
37 #include "uvm_test_ioctl.h"
38 #include "uvm_range_group.h"
39 #include "uvm_forward_decl.h"
40 #include "uvm_mmu.h"
41 #include "uvm_hal_types.h"
42 #include "uvm_mem.h"
43 #include "uvm_tracker.h"
44 #include "uvm_ioctl.h"
45
46 // VA Ranges are the UVM driver equivalent of Linux kernel vmas. They represent
47 // user allocations of any page-aligned size. We maintain these as a separate
48 // data structure from the vma tree for several reasons:
49 //
50 // 1) RM allocations mapped to the GPU by UVM don't have associated UVM vmas
51 //
52 // 2) We don't always have a separate reference on the vma's mm_struct, so we
53 // can't always lock mmap_lock on paths where current->mm != vma->vm_mm.
54 //
55 // 3) HMM vmas aren't ours, so we can't use their vm_private_data pointers.
56 //
57 // The tree as a whole is protected by va_space->lock. Faults and mappings only
58 // need to take the lock in read mode.
59 // Modification of the range state (such as changes to logical permissions or
60 // location preferences) must take the lock in write mode.
61 //
62 // VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED:
63 // Each va_range is contained completely within a parent vma. There can be
64 // multiple va_ranges under the same vma, but not vice versa. All VAs within
65 // the va_range share the same policy state.
66 //
67 // Each va_range is a collection of VA blocks. The VA blocks each have
68 // individual locks, and they hold the current mapping and location state
69 // for their block across all processors (CPU and all GPUs).
70 //
71 // VA ranges with type == UVM_VA_RANGE_TYPE_EXTERNAL:
//     These ranges track physical allocations made by RM. The UVM driver is
//     responsible for mapping them to the GPU(s), but not to the CPU. These
//     ranges support neither faulting nor migration, and they do not
//     necessarily correspond to valid vmas.
76 //
77 // These ranges do not have blocks. All state (page tables, mapping handles,
78 // etc) is maintained within the range.
79 //
80 // VA ranges with type == UVM_VA_RANGE_TYPE_CHANNEL:
81 // These are similar to EXTERNAL ranges, except they represent internal
82 // allocations required for user channels to operate (context save areas,
83 // for example).
84 //
85 // VA ranges with type == UVM_VA_RANGE_TYPE_SKED_REFLECTED:
86 // These ranges track special SKED reflected mappings required for CNP. The
87 // mappings don't have any physical backing. They just use PTEs with a
88 // special kind, see make_sked_reflected_pte_pascal() for an example of the
89 // PTE encoding.
90 // Notably the API that creates these ranges calls them "dynamic parallelism
91 // regions", but we use "SKED reflected ranges" internally as it's more
92 // descriptive.
93 //
94 // VA ranges with type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
95 // These ranges track semaphore pool allocations. They are backed by sysmem,
96 // and persistently mapped on the CPU and all GPUs (with registered VA
97 // spaces) in a user VA space. The ranges are also mapped on UVM internal VA
98 // space on the CPU and all registered GPUs.
99 //
100 // These ranges do not have blocks.
101 //
102
// Discriminator stored in uvm_va_range_t::type. It identifies which kind of
// allocation a VA range describes and therefore which per-type state in the
// range is in use.
//
// This enum must be kept in sync with UVM_TEST_VA_RANGE_TYPE in
// uvm_test_ioctl.h
typedef enum
{
    // Explicitly zero, so zero-filled storage never reads as a real type
    UVM_VA_RANGE_TYPE_INVALID = 0,
    UVM_VA_RANGE_TYPE_MANAGED,
    UVM_VA_RANGE_TYPE_EXTERNAL,
    UVM_VA_RANGE_TYPE_CHANNEL,
    UVM_VA_RANGE_TYPE_SKED_REFLECTED,
    UVM_VA_RANGE_TYPE_SEMAPHORE_POOL,

    // Sentinel one past the last real type; not a valid range type itself
    UVM_VA_RANGE_TYPE_MAX
} uvm_va_range_type_t;
115
// Wrapper to protect access to VMA's vm_page_prot. Managed VA ranges hold a
// pointer to one of these (uvm_va_range_managed_t::vma_wrapper), and
// uvm_va_range_vma() expects vma->vm_private_data to point back at it.
typedef struct
{
    // Needed for creating CPU mappings on the va_range. Do not access this
    // directly, instead use uvm_va_range_vma and friends.
    struct vm_area_struct *vma;

    // NOTE(review): presumably the lock that serializes the vm_page_prot
    // accesses mentioned above — confirm against the users of this wrapper.
    uvm_rw_semaphore_t lock;
} uvm_vma_wrapper_t;
125
126 // TODO: Bug 1733295. VA range types should really be inverted. Instead of
127 // maintaining common node state with a union of structs, we should have
128 // separate C types for each VA range type. Each type would embed a common
129 // VA range node.
130 //
131 // There's a lot of state in the top-level uvm_va_range_t struct below
132 // which really belongs in the per-type structs (for example, blocks).
133 // We're deferring that cleanup to the full refactor.
134
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_MANAGED
typedef struct
{
    // This is null in the case of a zombie allocation. Zombie allocations are
    // created from unfreed allocations at termination of a process which used
    // UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE, when at least one other
    // process is sharing the UVM file descriptor.
    //
    // uvm_va_range_is_managed_zombie() tests exactly this condition.
    uvm_vma_wrapper_t *vma_wrapper;

    // Managed allocations only use this policy and never use the policy
    // stored in the va_block for HMM allocations.
    uvm_va_policy_t policy;

    // One data descriptor per perf module type.
    // NOTE(review): presumably indexed by the perf module type enum — confirm
    // in uvm_perf_module.h.
    uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
} uvm_va_range_managed_t;
150
// Reference-counted record of the RM handle behind an external allocation.
// External GPU mappings point at one of these through
// uvm_ext_gpu_map_t::mem_handle.
typedef struct
{
    // GPU mapping the allocation. The GPU's RM address space is required when
    // releasing the handle.
    uvm_gpu_t *gpu;

    // RM handle to the physical allocation. This handle is dup'd into our client
    // once - on initial mapping of the external allocation. If the allocation is
    // ever split, its ref_count is incremented. The allocation is not released
    // until the ref_count drops to 0.
    NvHandle rm_handle;

    // Refcount for this handle/allocation. The refcount is used when external
    // ranges are split, resulting in two ranges using the same physical allocation.
    nv_kref_t ref_count;
} uvm_ext_gpu_mem_handle;
167
// One mapping, on a single GPU, of either an external allocation or a sparse
// region (a mapping with no allocation behind it). Instances are stored in a
// uvm_ext_gpu_range_tree_t.
typedef struct
{
    // Storage in the range tree. uvm_ext_gpu_map_container() converts a
    // pointer to this node back into the enclosing uvm_ext_gpu_map_t.
    uvm_range_tree_node_t node;

    // Handle to the physical user allocation dup'd into our client. This
    // prevents the allocation from being removed until we free it, even if the
    // user frees their handle without telling us.
    // This will be NULL for sparse mappings, which don't correspond to actual
    // allocations.
    uvm_ext_gpu_mem_handle *mem_handle;

    // Tracks completion of PTE writes on pt_range_vec. The tree lock
    // protecting this ext_gpu_map may be dropped before those writes are
    // complete, so subsequent operations on this ext_gpu_map must acquire this
    // tracker before operating on pt_range_vec.
    uvm_tracker_t tracker;

    // GPU on which this allocation is mapped.
    uvm_gpu_t *gpu;

    // GPU which owns the allocation. For sysmem, this is the GPU that the
    // sysmem was originally allocated under. For the allocation to remain valid
    // we need to prevent the GPU from going away, similarly to P2P mapped
    // memory.
    // Similarly for EGM memory.
    //
    // This field is not used for sparse mappings as they don't have an
    // allocation and, hence, owning GPU.
    //
    // TODO: Bug 1811006: The semantics of sysmem might change depending on the
    //       resolution of this bug.
    //
    // TODO: Bug 1757136: For SLI, this is any GPU in the SLI group. We may need
    //       to handle that specially.
    uvm_gpu_t *owning_gpu;

    // We need to know whether this memory is actually located on owning_gpu so
    // we know what type of membar is needed at TLB invalidate time, and to know
    // if the mapping GPU has to be unmapped on UvmDisablePeerAccess.
    //
    // This field is not used for sparse mappings as they don't have physical
    // backing.
    bool is_sysmem;

    // EGM memory. If true is_sysmem also has to be true and owning_gpu
    // has to be valid.
    bool is_egm;

    // GPU page tables mapping the allocation
    uvm_page_table_range_vec_t pt_range_vec;

    // Node for the deferred free list where this allocation is stored upon
    // being unmapped.
    //
    // This field is unused for sparse mappings. Since they don't have physical
    // backing there is no RM object to be freed when the mapping is unmapped.
    uvm_deferred_free_object_t deferred_free;
} uvm_ext_gpu_map_t;
225
// A range tree of uvm_ext_gpu_map_t mappings bundled with the mutex that
// protects it. uvm_va_range_external_t keeps one of these per GPU.
typedef struct
{
    // Lock protecting the range tree.
    uvm_mutex_t lock;

    // Range tree that contains all of the mapped portions of an External VA
    // range. The tree holds uvm_ext_gpu_map_t instances.
    uvm_range_tree_t tree;
} uvm_ext_gpu_range_tree_t;
235
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_EXTERNAL
typedef struct
{
    // Mask of GPUs which have mappings to this VA range. If a bit in this mask
    // is set, the corresponding pointer in gpu_ranges is valid.
    // The bitmap can be safely accessed by following the locking rules:
    //  * If the VA space lock is held for write, the mask can be read or written
    //    normally.
    //  * If the VA space lock is held for read, and one of the range tree locks is
    //    held, only the bit corresponding to that GPU range tree can be accessed.
    //    Writes must use uvm_processor_mask_set_atomic and
    //    uvm_processor_mask_clear_atomic to avoid clobbering other bits in the
    //    mask. If no range tree lock is held, the mask cannot be accessed.
    //  * If the VA space lock is not held, the mask cannot be accessed
    uvm_processor_mask_t mapped_gpus;

    // Per-GPU tree of mapped external allocations. This has to be per-GPU in the VA
    // range because each GPU is able to map a completely different set of
    // allocations to the same VA range.
    uvm_ext_gpu_range_tree_t gpu_ranges[UVM_ID_MAX_GPUS];

    // Dynamically allocated processor mask. It is allocated in
    // uvm_va_range_create_external() and used and freed in uvm_free().
    uvm_processor_mask_t *retained_mask;
} uvm_va_range_external_t;
260
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_CHANNEL. This
// represents a channel buffer resource and mapping.
typedef struct
{
    // Only a single GPU can map a channel resource, so we only need one GPU
    // VA space parent.
    uvm_gpu_va_space_t *gpu_va_space;

    // Page tables mapped by this range
    uvm_page_table_range_vec_t pt_range_vec;

    // Physical location of this channel resource. All pages have the same
    // aperture.
    uvm_aperture_t aperture;

    // Note that this is not a normal RM object handle. It is a non-zero opaque
    // identifier underneath the GPU VA space which represents this channel
    // resource. Each channel using this VA range has retained this descriptor
    // and is responsible for releasing it. That's safe because channels outlive
    // their VA ranges.
    NvP64 rm_descriptor;

    // This is an ID assigned by RM to each resource descriptor.
    NvU32 rm_id;

    // The TSG which owns this mapping. Sharing of VA ranges is only allowed
    // within the same TSG. If valid == false, no sharing is allowed because the
    // channel is not in a TSG.
    struct
    {
        bool valid;
        NvU32 id;
    } tsg;

    // NOTE(review): presumably the number of channels currently sharing this
    // VA range (see the tsg comment above) — confirm in the implementation.
    NvU64 ref_count;

    // Storage in the corresponding uvm_gpu_va_space's channel_va_ranges list
    struct list_head list_node;
} uvm_va_range_channel_t;
300
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_SKED_REFLECTED. This
// represents a sked reflected mapping. As described in the overview at the top
// of this file, these mappings have no physical backing, which is why only
// page table state is kept here.
typedef struct
{
    // Each SKED reflected range is unique to a single GPU so only a single GPU
    // VA space needs to be tracked.
    uvm_gpu_va_space_t *gpu_va_space;

    // Page tables mapped by this range
    uvm_page_table_range_vec_t pt_range_vec;
} uvm_va_range_sked_reflected_t;
312
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL
typedef struct
{
    // Memory object for the pool. Per the overview at the top of this file,
    // semaphore pool ranges are backed by sysmem.
    uvm_mem_t *mem;

    // The optional owner is a GPU (at most one) that has the allocation cached -
    // in this case, all writes must be done from this GPU.
    // protected by va_space lock
    uvm_gpu_t *owner;

    // Per-gpu attributes
    uvm_mem_gpu_mapping_attrs_t gpu_attrs[UVM_ID_MAX_GPUS];

    // Default attributes to assign when a new GPU is registered
    uvm_mem_gpu_mapping_attrs_t default_gpu_attrs;

    // Tracks all outstanding GPU work using this allocation.
    uvm_tracker_t tracker;

    // NOTE(review): presumably the lock protecting tracker — confirm in the
    // implementation.
    uvm_mutex_t tracker_lock;
} uvm_va_range_semaphore_pool_t;
332
// A VA range: the state common to every range type followed by a union holding
// the state specific to the range's type.
struct uvm_va_range_struct
{
    // Parent uvm_va_space.
    uvm_va_space_t *va_space;

    // Storage in VA range tree. Also contains range start and end.
    // start and end + 1 have to be PAGE_SIZE aligned.
    uvm_range_tree_node_t node;

    // Force the next split on this range to fail. Set by error injection ioctl
    // (testing purposes only).
    bool inject_split_error;

    // Force the next register_gpu_va_space to fail while adding this va_range.
    // Set by error injection ioctl (testing purposes only).
    bool inject_add_gpu_va_space_error;

    // Mask of UVM-Lite GPUs for the VA range
    //
    // If the preferred location is set to a non-faultable GPU or the CPU,
    // this mask contains all non-faultable GPUs that are in the accessed by
    // mask and the preferred location itself if it's a GPU. Empty otherwise.
    //
    // All UVM-Lite GPUs have mappings only to the preferred location. The
    // mappings are initially established only when the pages are resident on
    // the preferred location, but persist after that until the preferred
    // location is changed or a GPU stops being a UVM-Lite GPU.
    uvm_processor_mask_t uvm_lite_gpus;

    // This is a uvm_va_block_t ** array of all VA block pointers under this
    // range. The pointers can be accessed using the functions
    // uvm_va_range_block() and uvm_va_range_block_create(). The latter
    // allocates the block if it doesn't already exist. Once allocated, the
    // blocks persist in the array until the parent VA range is destroyed.
    //
    // Each element holds a uvm_va_block_t pointer stored as an atomic long;
    // uvm_va_range_block() reads it with atomic_long_read() and casts it back.
    // That accessor asserts type == UVM_VA_RANGE_TYPE_MANAGED.
    //
    // Concurrent on-demand allocation requires the use of either atomics or a
    // spin lock. Given that we don't want to take a spin lock for every lookup,
    // and that the blocks are persistent, atomics are preferred.
    //
    // The number of blocks is calculated from the range size using
    // uvm_va_range_num_blocks().
    //
    // TODO: Bug 1766585: Compare perf of up-front allocation and demand-
    //       allocation of blocks in the common case (lots of accessed blocks)
    //       and the sparse case. If the common case is hurt by demand-
    //       allocation, or if the sparse case isn't helped much, just allocate
    //       them all at range allocation.
    atomic_long_t *blocks;

    // Identifies which member of the union below describes this range.
    uvm_va_range_type_t type;

    // Per-type state. Each member's typedef documents the type value it
    // corresponds to.
    union
    {
        uvm_va_range_managed_t managed;
        uvm_va_range_external_t external;
        uvm_va_range_channel_t channel;
        uvm_va_range_sked_reflected_t sked_reflected;
        uvm_va_range_semaphore_pool_t semaphore_pool;
    };
};
392
// Module load/exit hooks for the VA range code. uvm_va_range_init() reports
// failure through its NV_STATUS return value; uvm_va_range_exit() returns
// nothing.
NV_STATUS uvm_va_range_init(void);
void uvm_va_range_exit(void);
396
uvm_va_range_size(uvm_va_range_t * va_range)397 static NvU64 uvm_va_range_size(uvm_va_range_t *va_range)
398 {
399 return uvm_range_tree_node_size(&va_range->node);
400 }
401
uvm_va_range_is_aligned(uvm_va_range_t * va_range,NvU64 alignment)402 static bool uvm_va_range_is_aligned(uvm_va_range_t *va_range, NvU64 alignment)
403 {
404 return IS_ALIGNED(va_range->node.start, alignment) && IS_ALIGNED(uvm_va_range_size(va_range), alignment);
405 }
406
uvm_va_range_is_managed_zombie(uvm_va_range_t * va_range)407 static bool uvm_va_range_is_managed_zombie(uvm_va_range_t *va_range)
408 {
409 return va_range->type == UVM_VA_RANGE_TYPE_MANAGED && va_range->managed.vma_wrapper == NULL;
410 }
411
// Create a va_range with type UVM_VA_RANGE_TYPE_MANAGED. The out va_range pointer
// is optional.
//
// Returns NV_ERR_UVM_ADDRESS_IN_USE if the vma overlaps with an existing range
// in the va_space tree.
NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
                                   struct mm_struct *mm,
                                   uvm_vma_wrapper_t *vma_wrapper,
                                   uvm_va_range_t **out_va_range);

// Create a va_range with type UVM_VA_RANGE_TYPE_EXTERNAL. The out va_range
// pointer is optional.
//
// Returns NV_ERR_UVM_ADDRESS_IN_USE if the range overlaps with an existing
// range in the va_space tree.
NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space,
                                       struct mm_struct *mm,
                                       NvU64 start,
                                       NvU64 length,
                                       uvm_va_range_t **out_va_range);

// Create a va_range with type UVM_VA_RANGE_TYPE_CHANNEL. The out va_range
// pointer is optional.
//
// Note that, unlike the other creators which take a start and a length, this
// one takes a start and an end address.
//
// Returns NV_ERR_UVM_ADDRESS_IN_USE if the range overlaps with an existing
// range in the va_space tree.
NV_STATUS uvm_va_range_create_channel(uvm_va_space_t *va_space,
                                      struct mm_struct *mm,
                                      NvU64 start,
                                      NvU64 end,
                                      uvm_va_range_t **out_va_range);

// NOTE(review): by analogy with the creators above, this presumably creates a
// va_range with type UVM_VA_RANGE_TYPE_SKED_REFLECTED, treats out_va_range as
// optional and returns NV_ERR_UVM_ADDRESS_IN_USE on overlap — confirm against
// the implementation.
NV_STATUS uvm_va_range_create_sked_reflected(uvm_va_space_t *va_space,
                                             struct mm_struct *mm,
                                             NvU64 start,
                                             NvU64 length,
                                             uvm_va_range_t **out_va_range);

// NOTE(review): by analogy with the creators above, this presumably creates a
// va_range with type UVM_VA_RANGE_TYPE_SEMAPHORE_POOL, with per_gpu_attrs
// pointing to per_gpu_attrs_count entries — confirm against the
// implementation, including whether out_va_range is optional here.
NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
                                             struct mm_struct *mm,
                                             NvU64 start,
                                             NvU64 length,
                                             const UvmGpuMappingAttributes *per_gpu_attrs,
                                             NvU32 per_gpu_attrs_count,
                                             uvm_va_range_t **out_va_range);
457
// Destroys any state associated with this VA range, removes the VA range from
// the VA space, and frees the VA range.
//
// deferred_free_list may be NULL if the VA range type is known to not require
// deferred free. Otherwise this function adds entries to the list for later
// processing by uvm_deferred_free_object_list.
void uvm_va_range_destroy(uvm_va_range_t *va_range, struct list_head *deferred_free_list);

// NOTE(review): presumably converts a managed range into a zombie, i.e. a
// managed range whose vma_wrapper is NULL (see uvm_va_range_is_managed_zombie)
// — confirm against the implementation.
void uvm_va_range_zombify(uvm_va_range_t *va_range);

// NOTE(review): the params/filp signatures suggest these are ioctl entry
// points — confirm where they are dispatched from.
NV_STATUS uvm_api_clean_up_zombie_resources(UVM_CLEAN_UP_ZOMBIE_RESOURCES_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_validate_va_range(UVM_VALIDATE_VA_RANGE_PARAMS *params, struct file *filp);
470
// Inform the VA range that a GPU VA space is now available for them to map, if
// the VA range is supposed to proactively map GPUs (UvmAllocSemaphorePool,
// UvmSetAccessedBy).
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
NV_STATUS uvm_va_range_add_gpu_va_space(uvm_va_range_t *va_range,
                                        uvm_gpu_va_space_t *gpu_va_space,
                                        struct mm_struct *mm);

// Destroy the VA range's mappings on the GPU, if it has any
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// NOTE(review): deferred_free_list presumably follows the same contract as in
// uvm_va_range_destroy() — confirm whether NULL is accepted here.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
void uvm_va_range_remove_gpu_va_space(uvm_va_range_t *va_range,
                                      uvm_gpu_va_space_t *gpu_va_space,
                                      struct mm_struct *mm,
                                      struct list_head *deferred_free_list);

// Inform the VA range that peer mappings can now be established between the
// GPUs, if the VA range is supposed to proactively create them (UvmSetAccessedBy).
NV_STATUS uvm_va_range_enable_peer(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);

// Unmap all page tables in this VA range which have peer mappings between these
// two GPUs, in either direction.
//
// NOTE(review): deferred_free_list presumably follows the same contract as in
// uvm_va_range_destroy() — confirm whether NULL is accepted here.
void uvm_va_range_disable_peer(uvm_va_range_t *va_range,
                               uvm_gpu_t *gpu0,
                               uvm_gpu_t *gpu1,
                               struct list_head *deferred_free_list);

// Notify the VA range of a newly registered GPU.
//
// LOCKING: the lock of the enclosing VA space is held in R/W mode
NV_STATUS uvm_va_range_register_gpu(uvm_va_range_t *va_range, uvm_gpu_t *gpu);

// Unmap all page tables in this VA range which map memory owned by this GPU.
// Managed ranges will have any memory still resident on this GPU evicted to
// system memory.
//
// deferred_free_list may be NULL if the VA range type is known to not require
// deferred free. Otherwise this function adds entries to the list for later
// processing by uvm_deferred_free_object_list.
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
void uvm_va_range_unregister_gpu(uvm_va_range_t *va_range,
                                 uvm_gpu_t *gpu,
                                 struct mm_struct *mm,
                                 struct list_head *deferred_free_list);
529
// Splits existing_va_range into two pieces, with new_va_range always after
// existing. existing is updated to have new_end. new_end+1 must be page-
// aligned.
//
// Before: [----------- existing ------------]
// After:  [---- existing ----][---- new ----]
//                            ^new_end
//
// On error, existing_va_range is still accessible and is left in its original
// functional state.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
//
// NOTE(review): new_va_range is the out parameter for the second piece;
// whether it may be NULL is not visible from this header — confirm.
NV_STATUS uvm_va_range_split(uvm_va_range_t *existing_va_range,
                             NvU64 new_end,
                             uvm_va_range_t **new_va_range);
545
546 // TODO: Bug 1707562: Merge va ranges
547
uvm_va_range_container(uvm_range_tree_node_t * node)548 static uvm_va_range_t *uvm_va_range_container(uvm_range_tree_node_t *node)
549 {
550 if (!node)
551 return NULL;
552 return container_of(node, uvm_va_range_t, node);
553 }
554
// Returns the va_range containing addr, if any.
// NOTE(review): presumably returns NULL when no va_range contains addr —
// confirm against the implementation.
uvm_va_range_t *uvm_va_range_find(uvm_va_space_t *va_space, NvU64 addr);
557
uvm_ext_gpu_map_container(uvm_range_tree_node_t * node)558 static uvm_ext_gpu_map_t *uvm_ext_gpu_map_container(uvm_range_tree_node_t *node)
559 {
560 if (!node)
561 return NULL;
562 return container_of(node, uvm_ext_gpu_map_t, node);
563 }
564
// Iterators for all va_ranges

// Visits every va_range linked on va_space's va_range_tree.head list, using
// the list entry embedded in each range's tree node (node.list).
#define uvm_for_each_va_range(va_range, va_space) \
    list_for_each_entry((va_range), &(va_space)->va_range_tree.head, node.list)

// Same as uvm_for_each_va_range, but built on list_for_each_entry_safe: the
// following entry is cached in va_range_next before the loop body runs, so the
// body may remove the current va_range from the list.
#define uvm_for_each_va_range_safe(va_range, va_range_next, va_space) \
    list_for_each_entry_safe((va_range), (va_range_next), &(va_space)->va_range_tree.head, node.list)
572
573
574 // Iterators for specific ranges
575
// Returns the first va_range in the range [start, end], if any. Callers in
// this header (uvm_va_space_range_empty() and the uvm_for_each_va_range_in*()
// iterators) treat a NULL return as "no such va_range".
uvm_va_range_t *uvm_va_space_iter_first(uvm_va_space_t *va_space, NvU64 start, NvU64 end);

// Returns the va_range following the provided va_range in address order, if
// that va_range's start <= the provided end.
//
// NOTE(review): uvm_for_each_va_range_in_safe() calls this with whatever
// uvm_va_space_iter_first() returned, which may be NULL, so the implementation
// is expected to tolerate a NULL va_range — confirm.
uvm_va_range_t *uvm_va_space_iter_next(uvm_va_range_t *va_range, NvU64 end);
582
583 // Like uvm_va_space_iter_next, but also returns NULL if the next va_range
584 // is not adjacent to the provided va_range.
uvm_va_space_iter_next_contig(uvm_va_range_t * va_range,NvU64 end)585 static uvm_va_range_t *uvm_va_space_iter_next_contig(uvm_va_range_t *va_range, NvU64 end)
586 {
587 uvm_va_range_t *next = uvm_va_space_iter_next(va_range, end);
588 if (next && next->node.start != va_range->node.end + 1)
589 return NULL;
590 return next;
591 }
592
593 // Returns whether the range [start, end] has any VA ranges within it
uvm_va_space_range_empty(uvm_va_space_t * va_space,NvU64 start,NvU64 end)594 static bool uvm_va_space_range_empty(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
595 {
596 return uvm_va_space_iter_first(va_space, start, end) == NULL;
597 }
598
// Visits, in the order produced by uvm_va_space_iter_first() and
// uvm_va_space_iter_next(), each va_range in [start, end]. end is evaluated on
// every iteration, so it must not have side effects.
#define uvm_for_each_va_range_in(va_range, va_space, start, end)            \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (end));  \
         (va_range);                                                        \
         (va_range) = uvm_va_space_iter_next((va_range), (end)))

// Same as uvm_for_each_va_range_in, but the following va_range is looked up
// and stored in va_range_next before the loop body runs, so the body does not
// need va_range to remain valid for the iteration to advance.
//
// Note that the initializer calls uvm_va_space_iter_next() on the result of
// uvm_va_space_iter_first() even when that result is NULL.
#define uvm_for_each_va_range_in_safe(va_range, va_range_next, va_space, start, end)    \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (end)),              \
             (va_range_next) = uvm_va_space_iter_next((va_range), (end));               \
         (va_range);                                                                    \
         (va_range) = (va_range_next), (va_range_next) = uvm_va_space_iter_next((va_range), (end)))
609
// Iterator for all contiguous VA ranges between [start, end]. If any part of
// [start, end] is not covered by a VA range, iteration stops.
//
// The initial lookup is restricted to [start, start], so iteration only begins
// if there is a va_range at start itself.
#define uvm_for_each_va_range_in_contig(va_range, va_space, start, end)         \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (start));    \
         (va_range);                                                            \
         (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))

// Like uvm_for_each_va_range_in_contig, but iteration begins at the
// caller-supplied first_va_range instead of performing a lookup. The va_space
// argument is not used by the expansion.
#define uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) \
    for ((va_range) = (first_va_range);                                               \
         (va_range);                                                                  \
         (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))

// Like uvm_for_each_va_range_in_contig but also stops iteration if any VA range
// has a type other than UVM_VA_RANGE_TYPE_MANAGED.
#define uvm_for_each_managed_va_range_in_contig(va_range, va_space, start, end) \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (start));    \
         (va_range) && (va_range)->type == UVM_VA_RANGE_TYPE_MANAGED;           \
         (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))
628
// Visits each va_range within the span of vma, i.e. within
// [vma->vm_start, vma->vm_end - 1], in the VA space that uvm_va_space_get()
// returns for vma->vm_file.
//
// vma is parenthesized at every use so that any pointer expression (for
// example "base + 1") can be passed. It is evaluated more than once, so it
// must not have side effects.
#define uvm_for_each_va_range_in_vma(va_range, vma)                 \
    uvm_for_each_va_range_in((va_range),                            \
                             uvm_va_space_get((vma)->vm_file),      \
                             (vma)->vm_start,                       \
                             (vma)->vm_end - 1)

// Same as uvm_for_each_va_range_in_vma, but built on
// uvm_for_each_va_range_in_safe: the following va_range is stored in
// va_range_next before the loop body runs.
#define uvm_for_each_va_range_in_vma_safe(va_range, va_range_next, vma) \
    uvm_for_each_va_range_in_safe((va_range),                           \
                                  (va_range_next),                      \
                                  uvm_va_space_get((vma)->vm_file),     \
                                  (vma)->vm_start,                      \
                                  (vma)->vm_end - 1)
641
// Returns the vma recorded in a managed va_range's vma_wrapper, after
// asserting that the range, the wrapper and the vma are mutually consistent.
//
// Only call this if you're sure that either:
// 1) You have a reference on the vma's vm_mm and that vma->vm_mm's mmap_lock is
//    held; or
// 2) You won't be operating on the vma (as with vm_insert_page) or accessing
//    any fields in the vma that can change without va_space->lock being held
//    (such as vm_flags).
//
// Otherwise, use uvm_va_range_vma_current or uvm_va_range_vma_check and be
// prepared to handle a NULL return value.
static struct vm_area_struct *uvm_va_range_vma(uvm_va_range_t *va_range)
{
    struct vm_area_struct *vma;

    // Only managed, non-zombie ranges have a vma_wrapper to read the vma from.
    UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type: %d", va_range->type);
    UVM_ASSERT(va_range->managed.vma_wrapper);

    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    // vm_file, vm_private_data, vm_start, and vm_end are all safe to access
    // here because they can't change without the kernel calling vm_ops->open
    // or vm_ops->close, which both take va_space->lock.
    vma = va_range->managed.vma_wrapper->vma;
    UVM_ASSERT(vma);

    // The vma must point back at this range's wrapper.
    UVM_ASSERT_MSG(vma->vm_private_data == va_range->managed.vma_wrapper,
                   "vma: 0x%llx [0x%lx, 0x%lx] has vm_private_data 0x%llx\n",
                   (NvU64)vma,
                   vma->vm_start,
                   vma->vm_end - 1,
                   (NvU64)vma->vm_private_data);

    // The vma's file must resolve to the same VA space as the range.
    UVM_ASSERT_MSG(va_range->va_space == uvm_va_space_get(vma->vm_file),
                   "va_range va_space: 0x%llx vm_file: 0x%llx vm_file va_space: 0x%llx",
                   (NvU64)va_range->va_space,
                   (NvU64)vma->vm_file,
                   (NvU64)uvm_va_space_get(vma->vm_file));

    // The range must lie entirely within [vm_start, vm_end - 1].
    UVM_ASSERT_MSG(va_range->node.start >= vma->vm_start,
                   "Range mismatch: va_range: [0x%llx, 0x%llx] vma: [0x%lx, 0x%lx]\n",
                   va_range->node.start,
                   va_range->node.end,
                   vma->vm_start,
                   vma->vm_end - 1);
    UVM_ASSERT_MSG(va_range->node.end <= vma->vm_end - 1,
                   "Range mismatch: va_range: [0x%llx, 0x%llx] vma: [0x%lx, 0x%lx]\n",
                   va_range->node.start,
                   va_range->node.end,
                   vma->vm_start,
                   vma->vm_end - 1);

    return vma;
}
690
691 // Check that the VA range's vma is safe to use under mm. If not, NULL is
692 // returned. If the vma is returned, there must be a reference on mm and
693 // mm->mmap_lock must be held.
uvm_va_range_vma_check(uvm_va_range_t * va_range,struct mm_struct * mm)694 static struct vm_area_struct *uvm_va_range_vma_check(uvm_va_range_t *va_range, struct mm_struct *mm)
695 {
696 struct vm_area_struct *vma;
697
698 UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type: %d\n", va_range->type);
699
700 // Zombies don't have a vma_wrapper.
701 if (!va_range->managed.vma_wrapper)
702 return NULL;
703
704 vma = uvm_va_range_vma(va_range);
705
706 // Examples of mm on various paths:
707 // - CPU fault vma->vm_mm
708 // - GPU fault current->mm or va_space->va_space_mm.mm
709 // - IOCTL current->mm or va_space->va_space_mm.mm
710 // - Process teardown NULL
711 //
712 // Since the "safe" mm varies based on the path, we may not have a reference
713 // on the vma's owning mm_struct. We won't know that until we look at the
714 // vma. By then it's too late to take mmap_lock since mmap_lock is above the
715 // va_space lock in our lock ordering, and we must be holding the va_space
716 // lock to query the va_range. Hence the need to detect the cases in which
717 // it's safe to operate on the vma.
718 //
719 // When we can't detect for certain that mm is safe to use, we shouldn't
720 // operate on the vma at all. The vma can't be outright freed until we drop
721 // the va_space lock so the pointer itself will remain valid, but its fields
722 // (like vm_start and vm_end) could be modified behind our back. We also
723 // aren't allowed to call vm_insert_page unless we hold the vma's mmap_lock.
724 //
725 // Note that if uvm_va_space_mm_enabled() is true, then vma->vm_mm must be
726 // va_space->va_space_mm.mm because we enforce that at mmap.
727 //
728 // An interesting case is when vma->vm_mm != current->mm. This can happen
729 // due to fork, ptrace, process teardown, etc. It will also be the case in
730 // the GPU fault handler.
731 if (mm != vma->vm_mm)
732 return NULL;
733
734 uvm_assert_mmap_lock_locked(vma->vm_mm);
735 return vma;
736 }
737
738 // Helper for use when the only mm which is known is current->mm
uvm_va_range_vma_current(uvm_va_range_t * va_range)739 static struct vm_area_struct *uvm_va_range_vma_current(uvm_va_range_t *va_range)
740 {
741 return uvm_va_range_vma_check(va_range, current->mm);
742 }
743
// Returns the maximum number of VA blocks which could be contained within the
// given va_range (number of elements in the va_range->blocks array).
// va_range->node.start and .end must be set.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
size_t uvm_va_range_num_blocks(uvm_va_range_t *va_range);

// Get the index within the va_range->blocks array of the VA block
// corresponding to addr. The block pointer is not guaranteed to be valid. Use
// either uvm_va_range_block or uvm_va_range_block_create to look up the block.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
size_t uvm_va_range_block_index(uvm_va_range_t *va_range, NvU64 addr);
757
758 // Looks up the VA block at va_range->blocks[index]. If no block is present at
759 // that index, NULL is returned.
760 //
761 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
uvm_va_range_block(uvm_va_range_t * va_range,size_t index)762 static uvm_va_block_t *uvm_va_range_block(uvm_va_range_t *va_range, size_t index)
763 {
764 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
765 UVM_ASSERT(index < uvm_va_range_num_blocks(va_range));
766 uvm_assert_rwsem_locked(&va_range->va_space->lock);
767
768 return (uvm_va_block_t *)atomic_long_read(&va_range->blocks[index]);
769 }
770
771 // Same as uvm_va_range_block except that the block is created if not already
772 // present in the array. If NV_OK is returned, the block has been allocated
773 // successfully.
774 //
775 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
776 NV_STATUS uvm_va_range_block_create(uvm_va_range_t *va_range, size_t index, uvm_va_block_t **out_block);
777
778 // Returns the first populated VA block in the VA range after the input
779 // va_block, or NULL if none. If the input va_block is NULL, this returns the
780 // first VA block in the VA range, if any exists.
781 uvm_va_block_t *uvm_va_range_block_next(uvm_va_range_t *va_range, uvm_va_block_t *va_block);
782
// Walk every populated VA block of __va_range, assigning each one in turn to
// __va_block. Unpopulated slots are skipped and no new VA blocks are created.
// The walk ends when uvm_va_range_block_next() returns NULL.
#define for_each_va_block_in_va_range(__va_range, __va_block) \
    for ((__va_block) = uvm_va_range_block_next((__va_range), NULL); \
         (__va_block) != NULL; \
         (__va_block) = uvm_va_range_block_next((__va_range), (__va_block)))
788
// Iterate over populated VA blocks in the range. Does not create new VA
// blocks.
//
// Safe version: the successor of the current block is looked up and stored in
// __va_block_next before the loop body runs, and the iteration never reads
// __va_block again after the body. This presumably allows the body to remove
// or destroy the current block.
//
// __va_block_next is only looked up when __va_block is non-NULL, both on entry
// and when advancing. A range without populated blocks therefore costs a
// single lookup and leaves __va_block_next set to NULL.
#define for_each_va_block_in_va_range_safe(__va_range, __va_block, __va_block_next) \
    for ((__va_block) = uvm_va_range_block_next((__va_range), NULL), \
         (__va_block_next) = (__va_block) ? uvm_va_range_block_next((__va_range), (__va_block)) : NULL; \
         (__va_block); \
         (__va_block) = (__va_block_next), \
         (__va_block_next) = (__va_block) ? uvm_va_range_block_next((__va_range), (__va_block)) : NULL)
796
797 // Set the VA range preferred location (or unset it if preferred location is
798 // UVM_ID_INVALID).
799 //
800 // Unsetting the preferred location potentially changes the range group
801 // association to UVM_RANGE_GROUP_ID_NONE if the VA range was previously
802 // associated with a non-migratable range group.
803 //
804 // Changing the preferred location also updates the mask and mappings of GPUs
805 // in UVM-Lite mode.
806 //
807 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
808 //
809 // If mm != NULL, that mm is used for any CPU mappings which may be created as
810 // a result of this call. See uvm_va_block_context_t::mm for details.
811 //
812 // If out_tracker != NULL any block work will be added to that tracker.
813 //
814 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
815 // mode.
816 NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range,
817 uvm_processor_id_t preferred_location,
818 int preferred_cpu_nid,
819 struct mm_struct *mm,
820 uvm_tracker_t *out_tracker);
821
822 // Add a processor to the accessed_by mask and establish any new required
823 // mappings.
824 //
825 // Also update the mask of UVM-Lite GPUs if needed.
826 //
827 // If mm != NULL, that mm is used for any CPU mappings which may be created as
828 // a result of this call. See uvm_va_block_context_t::mm for details.
829 //
830 // If out_tracker != NULL any block work will be added to that tracker.
831 //
832 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
833 // mode.
834 NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range,
835 uvm_processor_id_t processor_id,
836 struct mm_struct *mm,
837 uvm_tracker_t *out_tracker);
838
839 // Remove a processor from the accessed_by mask
840 //
841 // If out_tracker != NULL any block work will be added to that tracker.
842 //
843 // This also updates the mask and mappings of the UVM-Lite GPUs if required.
844 void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
845 uvm_processor_id_t processor_id,
846 uvm_tracker_t *out_tracker);
847
848 // Set read-duplication and remove any existing accessed_by and remote mappings
849 //
850 // If mm != NULL, that mm is used for any CPU mappings which may be created as
851 // a result of this call. See uvm_va_block_context_t::mm for details.
852 //
853 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
854 // mode.
855 NV_STATUS uvm_va_range_set_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm);
856
857 // Unset read-duplication and establish accessed_by mappings
858 //
859 // If mm != NULL, that mm is used for any CPU mappings which may be created as
860 // a result of this call. See uvm_va_block_context_t::mm for details.
861 //
862 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
863 // mode.
864 NV_STATUS uvm_va_range_unset_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm);
865
866 // Create and destroy vma wrappers
867 uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(struct vm_area_struct *vma);
868 void uvm_vma_wrapper_destroy(uvm_vma_wrapper_t *vma_wrapper);
869
uvm_va_range_get_policy(uvm_va_range_t * va_range)870 static uvm_va_policy_t *uvm_va_range_get_policy(uvm_va_range_t *va_range)
871 {
872 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
873 return &va_range->managed.policy;
874 }
875
876 NV_STATUS uvm_test_va_range_info(UVM_TEST_VA_RANGE_INFO_PARAMS *params, struct file *filp);
877 NV_STATUS uvm_test_va_range_split(UVM_TEST_VA_RANGE_SPLIT_PARAMS *params, struct file *filp);
878 NV_STATUS uvm_test_va_range_inject_split_error(UVM_TEST_VA_RANGE_INJECT_SPLIT_ERROR_PARAMS *params, struct file *filp);
879 NV_STATUS uvm_test_va_range_inject_add_gpu_va_space_error(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR_PARAMS *params,
880 struct file *filp);
881
882 #endif // __UVM_VA_RANGE_H__
883