/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_VA_SPACE_H__
#define __UVM_VA_SPACE_H__

#include "uvm_processors.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_range_tree.h"
#include "uvm_range_group.h"
#include "uvm_forward_decl.h"
#include "uvm_mmu.h"
#include "uvm_linux.h"
#include "uvm_common.h"
#include "nv-kref.h"
#include "nv-linux.h"
#include "uvm_perf_events.h"
#include "uvm_perf_module.h"
#include "uvm_va_block_types.h"
#include "uvm_va_block.h"
#include "uvm_hmm.h"
#include "uvm_test_ioctl.h"
#include "uvm_ats.h"
#include "uvm_va_space_mm.h"
#include "uvm_conf_computing.h"

// uvm_deferred_free_object provides a mechanism for building and later freeing
// a list of objects which are owned by a VA space, but can't be freed while
// the VA space lock is held.

typedef enum
{
    UVM_DEFERRED_FREE_OBJECT_TYPE_CHANNEL,
    UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE,
    UVM_DEFERRED_FREE_OBJECT_TYPE_EXTERNAL_ALLOCATION,
    UVM_DEFERRED_FREE_OBJECT_TYPE_COUNT
} uvm_deferred_free_object_type_t;

typedef struct
{
    uvm_deferred_free_object_type_t type;
    struct list_head list_node;
} uvm_deferred_free_object_t;

static void uvm_deferred_free_object_add(struct list_head *list,
                                         uvm_deferred_free_object_t *object,
                                         uvm_deferred_free_object_type_t type)
{
    object->type = type;
    list_add_tail(&object->list_node, list);
}

// Walks the list of pending objects and frees each one as appropriate to its
// type.
//
// LOCKING: May take the GPU isr_lock and the RM locks.
void uvm_deferred_free_object_list(struct list_head *deferred_free_list);
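
// Illustrative sketch (not part of the original header): a typical deferred
// free pattern embeds uvm_deferred_free_object_t in the owning object and
// recovers the owner with container_of() when draining the list. The
// my_channel_t type and my_channel_destroy() below are hypothetical names,
// used for illustration only.
//
//     typedef struct
//     {
//         // ... owner state ...
//         uvm_deferred_free_object_t deferred_free;
//     } my_channel_t;
//
//     // While holding the VA space lock:
//     uvm_deferred_free_object_add(&deferred_free_list,
//                                  &channel->deferred_free,
//                                  UVM_DEFERRED_FREE_OBJECT_TYPE_CHANNEL);
//
//     // After dropping the VA space lock, the free path switches on
//     // object->type and recovers the owner:
//     my_channel_t *channel = container_of(object, my_channel_t, deferred_free);
//     my_channel_destroy(channel);
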
typedef enum
{
    // The GPU VA space has been initialized but not yet inserted into the
    // parent VA space.
    UVM_GPU_VA_SPACE_STATE_INIT = 0,

    // The GPU VA space is active in the VA space.
    UVM_GPU_VA_SPACE_STATE_ACTIVE,

    // The GPU VA space is no longer active in the VA space. This state can be
    // observed when threads retain the gpu_va_space then drop the VA space
    // lock. After re-taking the VA space lock, the state must be inspected to
    // see if another thread unregistered the gpu_va_space in the meantime.
    UVM_GPU_VA_SPACE_STATE_DEAD,

    UVM_GPU_VA_SPACE_STATE_COUNT
} uvm_gpu_va_space_state_t;

struct uvm_gpu_va_space_struct
{
    // Parent pointers
    uvm_va_space_t *va_space;
    uvm_gpu_t *gpu;

    uvm_gpu_va_space_state_t state;

    // Handle to the duped GPU VA space, to be used for all further GPU VA
    // space related UVM-RM interactions.
    uvmGpuAddressSpaceHandle duped_gpu_va_space;
    bool did_set_page_directory;

    uvm_page_tree_t page_tables;

    // List of all uvm_user_channel_t's under this GPU VA space
    struct list_head registered_channels;

    // List of all uvm_va_range_t's under this GPU VA space with type ==
    // UVM_VA_RANGE_TYPE_CHANNEL. Used at channel registration time to find
    // shareable VA ranges without having to iterate through all VA ranges in
    // the VA space.
    struct list_head channel_va_ranges;

    // Boolean which is 1 if no new channel registration is allowed. This is
    // set when all the channels under the GPU VA space have been stopped, to
    // prevent new ones from entering after we drop the VA space lock. It is
    // an atomic_t because multiple threads may set it to 1 concurrently.
    atomic_t disallow_new_channels;

    // Node for the deferred free list where this GPU VA space is stored upon
    // being unregistered.
    uvm_deferred_free_object_t deferred_free;

    // Reference count for this gpu_va_space. This only protects the memory
    // object itself, for use in cases when the gpu_va_space needs to be
    // accessed across dropping and re-acquiring the VA space lock.
    nv_kref_t kref;

    // ATS specific state
    uvm_ats_gpu_va_space_t ats;
};

typedef struct
{
    int numa_node;

    uvm_processor_mask_t gpus;
} uvm_cpu_gpu_affinity_t;
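
// Illustrative sketch (not part of the original header): the DEAD state above
// is what a thread must check for after retaining a gpu_va_space across a VA
// space lock drop. A minimal pattern, assuming the caller starts out holding
// the VA space lock in read mode:
//
//     uvm_gpu_va_space_retain(gpu_va_space);
//     uvm_va_space_up_read(va_space);
//
//     // ... work that can't be done under the VA space lock ...
//
//     uvm_va_space_down_read(va_space);
//     if (uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_DEAD) {
//         // Another thread unregistered the GPU VA space in the meantime.
//         // Only the memory object is still valid; back out.
//     }
//     uvm_gpu_va_space_release(gpu_va_space);
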
struct uvm_va_space_struct
{
    // Mask of gpus registered with the va space
    uvm_processor_mask_t registered_gpus;

    // Array of pointers to the uvm_gpu_t objects that correspond to the
    // uvm_processor_id_t index.
    //
    // With SMC, GPUs can be partitioned so the number of uvm_gpu_t objects
    // can be larger than UVM_ID_MAX_GPUS. However, each VA space can only
    // subscribe to a single partition per GPU, so it is fine to have a
    // regular processor mask.
    uvm_gpu_t *registered_gpus_table[UVM_ID_MAX_GPUS];

    // Mask of processors registered with the va space that support replayable
    // faults.
    uvm_processor_mask_t faultable_processors;

    // Mask of processors registered with the va space that don't support
    // faulting.
    uvm_processor_mask_t non_faultable_processors;

    // Count of non-fault-capable processors with a GPU VA space registered.
    NvU32 num_non_faultable_gpu_va_spaces;

    // Semaphore protecting the state of the va space
    uvm_rw_semaphore_t lock;

    // Lock taken prior to taking the VA space lock in write mode, or prior to
    // taking the VA space lock in read mode on a path which will call into
    // RM. See UVM_LOCK_ORDER_VA_SPACE_SERIALIZE_WRITERS in uvm_lock.h.
    uvm_mutex_t serialize_writers_lock;

    // Lock taken to serialize down_reads on the VA space lock with up_writes
    // in other threads. See
    // UVM_LOCK_ORDER_VA_SPACE_READ_ACQUIRE_WRITE_RELEASE_LOCK in uvm_lock.h.
    uvm_mutex_t read_acquire_write_release_lock;

    // Tree of uvm_va_range_t's
    uvm_range_tree_t va_range_tree;

    // Kernel mapping structure passed to unmap_mapping_range to unmap CPU
    // PTEs in this process.
    struct address_space *mapping;

    // Storage in g_uvm_global.va_spaces.list
    struct list_head list_node;

    // Monotonically increasing counter for range group IDs
    atomic64_t range_group_id_counter;

    // Range groups
    struct radix_tree_root range_groups;
    uvm_range_tree_t range_group_ranges;

    // Peer-to-peer table: a bitmask of peer-to-peer pairs enabled in this
    // va_space, indexed by a peer_table_index returned by
    // uvm_gpu_peer_table_index().
    DECLARE_BITMAP(enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS);

    // Temporary copy of the above state used to avoid allocation during VA
    // space destroy.
    DECLARE_BITMAP(enabled_peers_teardown, UVM_MAX_UNIQUE_GPU_PAIRS);

    // Interpreting these processor masks:
    //      uvm_processor_mask_test(foo[A], B)
    // ...should be read as "test if A foo B." For example:
    //      uvm_processor_mask_test(accessible_from[B], A)
    // means "test if B is accessible_from A."

    // Pre-computed masks that contain, for each processor, a mask of
    // processors which that processor can directly access. In other words,
    // this will test whether A has direct access to B:
    //      uvm_processor_mask_test(can_access[A], B)
    uvm_processor_mask_t can_access[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that have direct access enabled to its memory. This is
    // the opposite direction from can_access. In other words, this will test
    // whether A has direct access to B:
    //      uvm_processor_mask_test(accessible_from[B], A)
    uvm_processor_mask_t accessible_from[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that can directly copy to and from its memory. This is
    // almost the same as the accessible_from masks, but also requires peer
    // identity mappings to be supported for peer access.
    uvm_processor_mask_t can_copy_from[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor, a mask of
    // processors to which that processor has NVLINK access. In other words,
    // this will test whether A has NVLINK access to B:
    //      uvm_processor_mask_test(has_nvlink[A], B)
    // This is a subset of can_access.
    uvm_processor_mask_t has_nvlink[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that have direct access to its memory and native support
    // for atomics in HW. This is a subset of accessible_from.
    uvm_processor_mask_t has_native_atomics[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that are indirect peers. Indirect peers can access each
    // other's memory like regular peers, but with an additional latency
    // and/or bandwidth penalty.
    uvm_processor_mask_t indirect_peers[UVM_ID_MAX_PROCESSORS];

    // Mask of gpu_va_spaces registered with the va space, indexed by gpu->id
    uvm_processor_mask_t registered_gpu_va_spaces;

    // Mask of GPUs which have temporarily dropped the VA space lock mid-
    // unregister. Used to make other paths return an error rather than
    // corrupting state.
    uvm_processor_mask_t gpu_unregister_in_progress;

    // Mask of processors that are participating in system-wide atomics
    uvm_processor_mask_t system_wide_atomics_enabled_processors;

    // Mask of physical GPUs where access counters are enabled on this VA
    // space
    uvm_parent_processor_mask_t access_counters_enabled_processors;

    // Array with information regarding CPU/GPU NUMA affinity. There is one
    // entry per CPU NUMA node. Entries in the array are populated
    // sequentially as new CPU NUMA nodes are discovered on GPU registration.
    // Each entry contains a CPU NUMA node id, and a mask with the GPUs
    // attached to it. Since each GPU can only be attached to one CPU node id,
    // the array can contain information for up to UVM_ID_MAX_GPUS nodes. The
    // information is stored in the VA space to avoid taking the global lock.
    uvm_cpu_gpu_affinity_t gpu_cpu_numa_affinity[UVM_ID_MAX_GPUS];

    // Unregistering a GPU may trigger memory eviction from the GPU to the
    // CPU. This must happen without allocation, so a buffer is preallocated
    // at GPU registration and freed at GPU unregistration.
    uvm_conf_computing_dma_buffer_t *gpu_unregister_dma_buffer[UVM_ID_MAX_GPUS];

    // Array of GPU VA spaces
    uvm_gpu_va_space_t *gpu_va_spaces[UVM_ID_MAX_GPUS];

    // Tracking of GPU VA spaces which have dropped the VA space lock and are
    // pending destruction. uvm_va_space_mm_shutdown has to wait for those
    // destroy operations to be completely done.
    struct
    {
        atomic_t num_pending;
        wait_queue_head_t wait_queue;
    } gpu_va_space_deferred_free;

    // Per-va_space event notification information for performance heuristics
    uvm_perf_va_space_events_t perf_events;

    uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];

    // Array of modules that are loaded in the va_space, indexed by module
    // type
    uvm_perf_module_t *perf_modules[UVM_PERF_MODULE_TYPE_COUNT];

    // Tools state for this VA space. Protected by tools.lock.
    struct
    {
        bool enabled;

        uvm_rw_semaphore_t lock;

        // Lists of counters listening for events on this VA space
        struct list_head counters[UVM_TOTAL_COUNTERS];
        struct list_head queues_v1[UvmEventNumTypesAll];
        struct list_head queues_v2[UvmEventNumTypesAll];

        // Node for this va_space in global subscribers list
        struct list_head node;
    } tools;

    // Boolean which is 1 if all user channels have already been stopped. This
    // is an atomic_t because multiple threads may call
    // uvm_va_space_stop_all_user_channels concurrently.
    atomic_t user_channels_stopped;

    // Prevent future registrations of any kind (GPU, GPU VA space, channel).
    // This is used when the associated va_space_mm is torn down, which has to
    // prevent any new work from being started in this VA space.
    bool disallow_new_registers;

    bool user_channel_stops_are_immediate;

    // Block context used for GPU unmap operations so that allocation is not
    // required on the teardown path. This can only be used while the VA space
    // lock is held in write mode. Access using uvm_va_space_block_context().
    uvm_va_block_context_t *va_block_context;

    NvU64 initialization_flags;

    // The mm currently associated with this VA space, if any.
    uvm_va_space_mm_t va_space_mm;

    union
    {
        uvm_ats_va_space_t ats;

        // HMM information about this VA space.
        uvm_hmm_va_space_t hmm;
    };

    struct
    {
        bool page_prefetch_enabled;
        bool skip_migrate_vma;

        atomic_t migrate_vma_allocation_fail_nth;

        atomic_t va_block_allocation_fail_nth;

        uvm_thread_context_wrapper_t *dummy_thread_context_wrappers;
        size_t num_dummy_thread_context_wrappers;

        atomic64_t destroy_gpu_va_space_delay_us;

        atomic64_t split_invalidate_delay_us;

        bool force_cpu_to_cpu_copy_with_ce;

        bool allow_allocation_from_movable;
    } test;

    // Queue item for deferred f_ops->release() handling
    nv_kthread_q_item_t deferred_release_q_item;
};
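
// Illustrative sketch (not part of the original header): reading the
// pre-computed mask convention documented above. With the "A foo B" rule,
// the same relationship can be checked from either side. gpu_id and other_id
// are hypothetical variables, and the uvm_id_value() indexing is an
// assumption about how these arrays are indexed:
//
//     // "Does gpu_id have direct access to the memory of other_id?"
//     bool direct =
//         uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu_id)], other_id);
//
//     // Equivalently, from the memory's point of view:
//     bool same =
//         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(other_id)], gpu_id);
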
static uvm_gpu_t *uvm_va_space_get_gpu(uvm_va_space_t *va_space, uvm_gpu_id_t gpu_id)
{
    uvm_gpu_t *gpu;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu_id));

    gpu = va_space->registered_gpus_table[uvm_id_gpu_index(gpu_id)];

    UVM_ASSERT(gpu);
    UVM_ASSERT(uvm_gpu_get(gpu->id) == gpu);

    return gpu;
}

static const char *uvm_va_space_processor_name(uvm_va_space_t *va_space, uvm_processor_id_t id)
{
    if (UVM_ID_IS_CPU(id))
        return "0: CPU";
    else
        return uvm_gpu_name(uvm_va_space_get_gpu(va_space, id));
}

static void uvm_va_space_processor_uuid(uvm_va_space_t *va_space, NvProcessorUuid *uuid, uvm_processor_id_t id)
{
    if (UVM_ID_IS_CPU(id)) {
        memcpy(uuid, &NV_PROCESSOR_UUID_CPU_DEFAULT, sizeof(*uuid));
    }
    else {
        uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
        UVM_ASSERT(gpu);
        memcpy(uuid, &gpu->uuid, sizeof(*uuid));
    }
}

static bool uvm_va_space_processor_has_memory(uvm_va_space_t *va_space, uvm_processor_id_t id)
{
    if (UVM_ID_IS_CPU(id))
        return true;

    return uvm_va_space_get_gpu(va_space, id)->mem_info.size > 0;
}

NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va_space_ptr, NvU64 flags);
void uvm_va_space_destroy(uvm_va_space_t *va_space);

// All VA space locking should be done with these wrappers. They're macros so
// that lock assertions are attributed to the correct line numbers.

#define uvm_va_space_down_write(__va_space)                                 \
    do {                                                                    \
        uvm_mutex_lock(&(__va_space)->serialize_writers_lock);              \
        uvm_mutex_lock(&(__va_space)->read_acquire_write_release_lock);     \
        uvm_down_write(&(__va_space)->lock);                                \
    } while (0)

#define uvm_va_space_up_write(__va_space)                                   \
    do {                                                                    \
        uvm_up_write(&(__va_space)->lock);                                  \
        uvm_mutex_unlock(&(__va_space)->read_acquire_write_release_lock);   \
        uvm_mutex_unlock(&(__va_space)->serialize_writers_lock);            \
    } while (0)

#define uvm_va_space_downgrade_write(__va_space)                                        \
    do {                                                                                \
        uvm_downgrade_write(&(__va_space)->lock);                                       \
        uvm_mutex_unlock_out_of_order(&(__va_space)->read_acquire_write_release_lock);  \
        uvm_mutex_unlock_out_of_order(&(__va_space)->serialize_writers_lock);           \
    } while (0)

// Call this when holding the VA space lock for write in order to downgrade to
// read on a path which also needs to make RM calls.
#define uvm_va_space_downgrade_write_rm(__va_space)                                     \
    do {                                                                                \
        uvm_assert_mutex_locked(&(__va_space)->serialize_writers_lock);                 \
        uvm_downgrade_write(&(__va_space)->lock);                                       \
        uvm_mutex_unlock_out_of_order(&(__va_space)->read_acquire_write_release_lock);  \
    } while (0)

#define uvm_va_space_down_read(__va_space)                                              \
    do {                                                                                \
        uvm_mutex_lock(&(__va_space)->read_acquire_write_release_lock);                 \
        uvm_down_read(&(__va_space)->lock);                                             \
        uvm_mutex_unlock_out_of_order(&(__va_space)->read_acquire_write_release_lock);  \
    } while (0)

// Call this if RM calls need to be made while holding the VA space lock in
// read mode. Note that taking read_acquire_write_release_lock is unnecessary
// since the down_read is serialized with another thread's up_write by the
// serialize_writers_lock.
#define uvm_va_space_down_read_rm(__va_space)                   \
    do {                                                        \
        uvm_mutex_lock(&(__va_space)->serialize_writers_lock);  \
        uvm_down_read(&(__va_space)->lock);                     \
    } while (0)

#define uvm_va_space_up_read(__va_space) uvm_up_read(&(__va_space)->lock)

#define uvm_va_space_up_read_rm(__va_space)                       \
    do {                                                          \
        uvm_up_read(&(__va_space)->lock);                         \
        uvm_mutex_unlock(&(__va_space)->serialize_writers_lock);  \
    } while (0)
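
// Illustrative sketch (not part of the original header): the wrappers above
// must be paired by flavor, since the _rm variants leave
// serialize_writers_lock held across the critical section. A minimal usage
// pattern:
//
//     // Plain read section:
//     uvm_va_space_down_read(va_space);
//     // ... read VA space state ...
//     uvm_va_space_up_read(va_space);
//
//     // Read section that calls into RM:
//     uvm_va_space_down_read_rm(va_space);
//     // ... read VA space state, make RM calls ...
//     uvm_va_space_up_read_rm(va_space);
//
//     // Write section, optionally downgrading to read:
//     uvm_va_space_down_write(va_space);
//     // ... modify VA space state ...
//     uvm_va_space_downgrade_write(va_space);
//     // ... read-only work ...
//     uvm_va_space_up_read(va_space);
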
// Get a registered gpu by uuid. This restricts the search for GPUs to those
// that have been registered with a va_space. This returns NULL if the GPU is
// not present, or not registered with the va_space.
//
// LOCKING: The VA space lock must be held.
uvm_gpu_t *uvm_va_space_get_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Like uvm_va_space_get_gpu_by_uuid, but also returns NULL if the GPU does
// not have a GPU VA space registered in the UVM va_space.
//
// LOCKING: The VA space lock must be held.
uvm_gpu_t *uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Same as uvm_va_space_get_gpu_by_uuid, but it also retains the GPU. The
// caller cannot assume that the GPU is still registered in the VA space after
// the function returns.
//
// LOCKING: The function takes and releases the VA space lock in read mode.
uvm_gpu_t *uvm_va_space_retain_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Returns whether read-duplication is supported. If changing_gpu is NULL,
// returns the current state. Otherwise, returns what the result would be once
// changing_gpu's VA space is added or removed (by inverting the GPU's current
// state).
bool uvm_va_space_can_read_duplicate(uvm_va_space_t *va_space, uvm_gpu_t *changing_gpu);

// Register a gpu in the va space. Note that each gpu can only be registered
// once in a va space.
//
// The input gpu_uuid is for the physical GPU. The user_rm_va_space argument
// identifies the SMC partition if provided and SMC is enabled.
//
// This call returns whether the GPU memory is a NUMA node in the kernel and
// the corresponding node id. It also returns the GI UUID (if gpu_uuid is a
// SMC partition) or a copy of gpu_uuid if the GPU is not SMC capable or SMC
// is not enabled.
NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
                                    const NvProcessorUuid *gpu_uuid,
                                    const uvm_rm_user_object_t *user_rm_va_space,
                                    NvBool *numa_enabled,
                                    NvS32 *numa_node_id,
                                    NvProcessorUuid *uuid_out);

// Unregister a gpu from the va space
NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Registers a GPU VA space with the UVM VA space.
NV_STATUS uvm_va_space_register_gpu_va_space(uvm_va_space_t *va_space,
                                             uvm_rm_user_object_t *user_rm_va_space,
                                             const NvProcessorUuid *gpu_uuid);

// Unregisters a GPU VA space from the UVM VA space.
NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Stop all user channels
//
// This function sets a flag in the VA space indicating that all the channels
// have already been stopped, and should only be used when no new user
// channels can be registered.
//
// LOCKING: The VA space lock must be held in read mode, not write.
void uvm_va_space_stop_all_user_channels(uvm_va_space_t *va_space);

// Calls uvm_user_channel_detach on all user channels in a VA space.
//
// The detached channels are added to the input list. The caller is expected
// to drop the VA space lock and call uvm_deferred_free_object_list to
// complete the destroy operation.
//
// LOCKING: The owning VA space must be locked in write mode.
void uvm_va_space_detach_all_user_channels(uvm_va_space_t *va_space, struct list_head *deferred_free_list);

// Returns whether peer access between these two GPUs has been enabled in this
// VA space. Both GPUs must be registered in the VA space.
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);
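
// Illustrative sketch (not part of the original header): the detach flow
// documented above builds a deferred-free list under the write lock, then
// drains it after the lock is dropped:
//
//     LIST_HEAD(deferred_free_list);
//
//     uvm_va_space_down_write(va_space);
//     uvm_va_space_detach_all_user_channels(va_space, &deferred_free_list);
//     uvm_va_space_up_write(va_space);
//
//     // May take the GPU isr_lock and the RM locks:
//     uvm_deferred_free_object_list(&deferred_free_list);
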
// Returns the va_space this file points to. Returns NULL if this file
// does not point to a va_space.
static uvm_va_space_t *uvm_fd_va_space(struct file *filp)
{
    uvm_va_space_t *va_space;
    uvm_fd_type_t type;

    type = uvm_fd_type(filp, (void **)&va_space);
    if (type != UVM_FD_VA_SPACE)
        return NULL;

    return va_space;
}

static uvm_va_space_t *uvm_va_space_get(struct file *filp)
{
    uvm_fd_type_t fd_type;
    uvm_va_space_t *va_space;

    fd_type = uvm_fd_type(filp, (void **)&va_space);
    UVM_ASSERT(uvm_file_is_nvidia_uvm(filp));
    UVM_ASSERT_MSG(fd_type == UVM_FD_VA_SPACE, "filp: 0x%llx", (NvU64)filp);

    return va_space;
}

static uvm_va_block_context_t *uvm_va_space_block_context(uvm_va_space_t *va_space, struct mm_struct *mm)
{
    uvm_assert_rwsem_locked_write(&va_space->lock);
    if (mm)
        uvm_assert_mmap_lock_locked(mm);

    uvm_va_block_context_init(va_space->va_block_context, mm);
    return va_space->va_block_context;
}

// Retains the GPU VA space memory object. destroy_gpu_va_space and
// uvm_gpu_va_space_release drop the count. This is used to keep the GPU VA
// space object allocated when dropping and re-taking the VA space lock. If
// another thread called remove_gpu_va_space in the meantime,
// gpu_va_space->state will be UVM_GPU_VA_SPACE_STATE_DEAD.
static inline void uvm_gpu_va_space_retain(uvm_gpu_va_space_t *gpu_va_space)
{
    nv_kref_get(&gpu_va_space->kref);
}

// This only frees the GPU VA space object itself, so it must have been
// removed from its VA space and destroyed prior to the final release.
void uvm_gpu_va_space_release(uvm_gpu_va_space_t *gpu_va_space);

// Wrapper for nvUvmInterfaceUnsetPageDirectory
void uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t *gpu_va_space);

static uvm_gpu_va_space_state_t uvm_gpu_va_space_state(uvm_gpu_va_space_t *gpu_va_space)
{
    UVM_ASSERT(gpu_va_space->gpu);
    UVM_ASSERT(gpu_va_space->va_space);

    return gpu_va_space->state;
}

// Return the GPU VA space for the given physical GPU.
// Locking: the va_space lock must be held.
uvm_gpu_va_space_t *uvm_gpu_va_space_get_by_parent_gpu(uvm_va_space_t *va_space,
                                                       uvm_parent_gpu_t *parent_gpu);

static uvm_gpu_va_space_t *uvm_gpu_va_space_get(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
{
    uvm_gpu_va_space_t *gpu_va_space;

    if (!gpu)
        return NULL;

    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
    if (gpu_va_space)
        UVM_ASSERT(gpu_va_space->gpu == gpu);

    return gpu_va_space;
}

#define for_each_gpu_va_space(__gpu_va_space, __va_space)                                                          \
    for (__gpu_va_space =                                                                                          \
             uvm_gpu_va_space_get(                                                                                 \
                 __va_space,                                                                                       \
                 uvm_processor_mask_find_first_va_space_gpu(&(__va_space)->registered_gpu_va_spaces, __va_space)   \
             );                                                                                                    \
         __gpu_va_space;                                                                                           \
         __gpu_va_space =                                                                                          \
             uvm_gpu_va_space_get(                                                                                 \
                 __va_space,                                                                                       \
                 __uvm_processor_mask_find_next_va_space_gpu(&(__va_space)->registered_gpu_va_spaces,              \
                                                             __va_space,                                           \
                                                             __gpu_va_space->gpu)                                  \
             )                                                                                                     \
        )

// Return the first GPU set in the given mask or NULL. The caller must ensure
// that the GPUs set in the mask are registered in the VA space and cannot be
// unregistered during this call.
static uvm_gpu_t *uvm_processor_mask_find_first_va_space_gpu(const uvm_processor_mask_t *mask, uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(uvm_processor_mask_gpu_subset(mask, &va_space->registered_gpus));

    gpu_id = uvm_processor_mask_find_first_gpu_id(mask);
    if (UVM_ID_IS_INVALID(gpu_id))
        return NULL;

    gpu = uvm_va_space_get_gpu(va_space, gpu_id);
    UVM_ASSERT_MSG(gpu, "gpu_id %u\n", uvm_id_value(gpu_id));

    return gpu;
}

static uvm_gpu_t *uvm_va_space_find_first_gpu(uvm_va_space_t *va_space)
{
    uvm_assert_rwsem_locked(&va_space->lock);

    return uvm_processor_mask_find_first_va_space_gpu(&va_space->registered_gpus, va_space);
}

// Same as uvm_processor_mask_find_next_va_space_gpu below, but gpu cannot be
// NULL
static uvm_gpu_t *__uvm_processor_mask_find_next_va_space_gpu(const uvm_processor_mask_t *mask,
                                                              uvm_va_space_t *va_space,
                                                              uvm_gpu_t *gpu)
{
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(gpu != NULL);
    UVM_ASSERT(uvm_processor_mask_gpu_subset(mask, &va_space->registered_gpus));

    gpu_id = uvm_processor_mask_find_next_id(mask, uvm_gpu_id_next(gpu->id));
    if (UVM_ID_IS_INVALID(gpu_id))
        return NULL;

    gpu = uvm_va_space_get_gpu(va_space, gpu_id);
    UVM_ASSERT_MSG(gpu, "gpu_id %u\n", uvm_id_value(gpu_id));

    return gpu;
}

// Return the next GPU with an id larger than gpu->id set in the given mask.
// The function returns NULL if gpu is NULL. The caller must ensure that the
// GPUs set in the mask are registered in the VA space and cannot be
// unregistered during this call.
static uvm_gpu_t *uvm_processor_mask_find_next_va_space_gpu(const uvm_processor_mask_t *mask,
                                                            uvm_va_space_t *va_space,
                                                            uvm_gpu_t *gpu)
{
    if (gpu == NULL)
        return NULL;

    return __uvm_processor_mask_find_next_va_space_gpu(mask, va_space, gpu);
}

#define for_each_va_space_gpu_in_mask(gpu, va_space, mask)                       \
    for (({uvm_assert_rwsem_locked(&(va_space)->lock);                           \
           gpu = uvm_processor_mask_find_first_va_space_gpu(mask, va_space);});  \
         gpu != NULL;                                                            \
         gpu = __uvm_processor_mask_find_next_va_space_gpu(mask, va_space, gpu))

// Helper to iterate over all GPUs registered in a UVM VA space
#define for_each_va_space_gpu(gpu, va_space) \
    for_each_va_space_gpu_in_mask(gpu, va_space, &(va_space)->registered_gpus)
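
// Illustrative sketch (not part of the original header): iterating the
// registered GPUs. The VA space lock must be held (the iterator asserts
// this), which also keeps the mask stable for the duration of the loop:
//
//     uvm_gpu_t *gpu;
//
//     uvm_va_space_down_read(va_space);
//     for_each_va_space_gpu(gpu, va_space) {
//         // ... per-GPU work; gpu is guaranteed non-NULL here ...
//     }
//     uvm_va_space_up_read(va_space);
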
// Return the processor in the candidates mask that is "closest" to src, or
// UVM_ID_MAX_PROCESSORS if candidates is empty. The order is:
// - src itself
// - Direct NVLINK GPU peers if src is CPU or GPU (1)
// - NVLINK CPU if src is GPU
// - Indirect NVLINK GPU peers if src is GPU
// - PCIe peers if src is GPU (2)
// - CPU if src is GPU
// - Deterministic selection from the pool of candidates
//
// (1) When src is a GPU, NVLINK GPU peers are preferred over the CPU because
//     in NUMA systems the CPU processor may refer to multiple CPU NUMA nodes,
//     and the bandwidth between src and the farthest CPU node can be
//     substantially lower than the bandwidth between src and its peer GPUs.
// (2) TODO: Bug 1764943: Is copying from a PCI peer always better than
//     copying from CPU?
uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space,
                                                      const uvm_processor_mask_t *candidates,
                                                      uvm_processor_id_t src);

// Iterate over each ID in mask in order of proximity to src. This is
// destructive to mask.
#define for_each_closest_id(id, mask, src, va_space)                    \
    for (id = uvm_processor_mask_find_closest_id(va_space, mask, src);  \
         UVM_ID_IS_VALID(id);                                           \
         uvm_processor_mask_clear(mask, id), id = uvm_processor_mask_find_closest_id(va_space, mask, src))
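
// Illustrative sketch (not part of the original header): because
// for_each_closest_id consumes the mask, iterate over a local copy when the
// original must be preserved. candidates and src are hypothetical variables
// for illustration:
//
//     uvm_processor_mask_t scratch;
//     uvm_processor_id_t id;
//
//     uvm_processor_mask_copy(&scratch, candidates);
//     for_each_closest_id(id, &scratch, src, va_space) {
//         // ... IDs arrive in order of proximity to src ...
//     }
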
// Return the GPU whose memory corresponds to the given node_id
static uvm_gpu_t *uvm_va_space_find_gpu_with_memory_node_id(uvm_va_space_t *va_space, int node_id)
{
    uvm_gpu_t *gpu;

    UVM_ASSERT(nv_numa_node_has_memory(node_id));

    if (!g_uvm_global.ats.supported)
        return NULL;

    for_each_va_space_gpu(gpu, va_space) {
        if (uvm_gpu_numa_node(gpu) == node_id)
            return gpu;
    }

    return NULL;
}

static bool uvm_va_space_memory_node_is_gpu(uvm_va_space_t *va_space, int node_id)
{
    return uvm_va_space_find_gpu_with_memory_node_id(va_space, node_id) != NULL;
}

// Return a processor mask with the GPUs attached to the node_id CPU memory
// node
static void uvm_va_space_get_gpus_attached_to_cpu_node(uvm_va_space_t *va_space,
                                                       int node_id,
                                                       uvm_processor_mask_t *gpus)
{
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(!uvm_va_space_memory_node_is_gpu(va_space, node_id));

    for_each_gpu_id(gpu_id) {
        const uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
        if (affinity->numa_node == node_id) {
            uvm_processor_mask_copy(gpus, &affinity->gpus);
            return;
        }
    }

    uvm_processor_mask_zero(gpus);
}

// Helper that returns the first GPU in the mask returned by
// uvm_va_space_get_gpus_attached_to_cpu_node, or NULL if the mask is empty
static uvm_gpu_t *uvm_va_space_find_first_gpu_attached_to_cpu_node(uvm_va_space_t *va_space, int node_id)
{
    uvm_processor_mask_t gpus;

    uvm_va_space_get_gpus_attached_to_cpu_node(va_space, node_id, &gpus);

    return uvm_processor_mask_find_first_va_space_gpu(&gpus, va_space);
}

// Obtain the user channel with the given instance_ptr. This is used during
// non-replayable fault service. This function needs to be called with the VA
// space lock held in order to prevent channels from being removed.
uvm_user_channel_t *uvm_gpu_va_space_get_user_channel(uvm_gpu_va_space_t *gpu_va_space,
                                                      uvm_gpu_phys_address_t instance_ptr);

// Whether some form of pageable access (ATS, HMM) is supported by the system
// on this VA space. This does NOT check whether GPUs with pageable support
// are present, just whether system + VA space support exists.
bool uvm_va_space_pageable_mem_access_supported(uvm_va_space_t *va_space);

NV_STATUS uvm_test_get_pageable_mem_access_type(UVM_TEST_GET_PAGEABLE_MEM_ACCESS_TYPE_PARAMS *params,
                                                struct file *filp);
NV_STATUS uvm_test_enable_nvlink_peer_access(UVM_TEST_ENABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_disable_nvlink_peer_access(UVM_TEST_DISABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_destroy_gpu_va_space_delay(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_force_cpu_to_cpu_copy_with_ce(UVM_TEST_FORCE_CPU_TO_CPU_COPY_WITH_CE_PARAMS *params,
                                                 struct file *filp);
NV_STATUS uvm_test_va_space_allow_movable_allocations(UVM_TEST_VA_SPACE_ALLOW_MOVABLE_ALLOCATIONS_PARAMS *params,
                                                      struct file *filp);

// Handle a CPU fault in the given VA space for a managed allocation,
// performing any operations necessary to establish a coherent CPU mapping
// (migrations, cache invalidates, etc.).
//
// Locking:
//   - vma->vm_mm->mmap_lock must be held in at least read mode. Note that
//     this might not be the same as current->mm->mmap_lock.
// Returns:
//   VM_FAULT_NOPAGE: if the page was faulted in successfully
//     (possibly or'ed with VM_FAULT_MAJOR if a migration was needed).
//   VM_FAULT_OOM: if system memory wasn't available.
//   VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed, for
//     example because it's within a range group which is non-migratable.
vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
                                          struct vm_area_struct *vma,
                                          struct vm_fault *vmf);

// Handle a CPU fault in the given VA space for an HMM allocation, performing
// any operations necessary to establish a coherent CPU mapping (migrations,
// cache invalidates, etc.).
//
// Locking:
//   - vma->vm_mm->mmap_lock must be held in at least read mode. Note that
//     this might not be the same as current->mm->mmap_lock.
// Returns:
//   VM_FAULT_NOPAGE: if the page was faulted in successfully
//     (possibly or'ed with VM_FAULT_MAJOR if a migration was needed).
//   VM_FAULT_OOM: if system memory wasn't available.
//   VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed.
vm_fault_t uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space,
                                      struct vm_area_struct *vma,
                                      struct vm_fault *vmf);

#endif // __UVM_VA_SPACE_H__