1 /******************************************************************************* 2 Copyright (c) 2015-2023 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #ifndef __UVM_VA_BLOCK_H__ 25 #define __UVM_VA_BLOCK_H__ 26 27 #include "uvm_forward_decl.h" 28 #include "uvm_types.h" 29 #include "uvm_linux.h" 30 #include "nv-kref.h" 31 #include "uvm_common.h" 32 #include "uvm_perf_module.h" 33 #include "uvm_processors.h" 34 #include "uvm_lock.h" 35 #include "uvm_test_ioctl.h" 36 #include "uvm_tracker.h" 37 #include "uvm_pmm_gpu.h" 38 #include "uvm_perf_thrashing.h" 39 #include "uvm_perf_utils.h" 40 #include "uvm_va_block_types.h" 41 #include "uvm_range_tree.h" 42 #include "uvm_mmu.h" 43 #include "nv-kthread-q.h" 44 45 #include <linux/mmu_notifier.h> 46 #include <linux/wait.h> 47 #include <linux/nodemask.h> 48 49 // VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations 50 // (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED): 51 // 52 // UVM: uvm_va_space -> uvm_va_range -> uvm_va_block 53 // HMM: uvm_va_space -> uvm_va_block 54 // 55 // Each VA block is contained within a single VA range, and contains state on 56 // VAs covered by that block. Most importantly, the block tracks the current 57 // state of the virtual-to-physical mappings for all VAs within that block 58 // across all processors, along with the physical residency location for each 59 // VA. 60 // 61 // The block serializes both CPU and GPU operations on all VAs under that block. 62 // The CPU work is serialized with the block lock, and the GPU work is 63 // serialized by the block work tracker which itself is protected by the block 64 // lock. 65 // 66 // The size of each block varies from the size of the smallest VA range 67 // (PAGE_SIZE) to the max block size specified by UVM_VA_BLOCK_BITS. No block 68 // will span a 2^UVM_VA_BLOCK_BITS boundary in VA space. The size of the block 69 // is determined by the alignment of the parent VA range and the block's 70 // placement within the range. 71 // 72 // Note that this means user space will get best allocation efficiency if it 73 // allocates memory in 2^UVM_VA_BLOCK_BITS naturally-aligned chunks. 74 75 // enums used for indexing into the array of pte_bits bitmaps in the VA block 76 // which hold the current state of each PTE. 
For a given {processor, PTE}, the 77 // bits represented here must be enough to re-create the non-address portion of 78 // the PTE for that processor. 79 80 // If _READ is not set, the PTE mapping is not valid. 81 // If _WRITE is set, _READ is also set (_WRITE implies _READ). 82 typedef enum 83 { 84 UVM_PTE_BITS_CPU_READ, 85 UVM_PTE_BITS_CPU_WRITE, 86 UVM_PTE_BITS_CPU_MAX 87 } uvm_pte_bits_cpu_t; 88 89 // If _READ is not set, the PTE mapping is not valid. 90 // If _WRITE is set, _READ is also set (_WRITE implies _READ). 91 // If _ATOMIC is set, _WRITE is also set (_ATOMIC implies _WRITE and _READ). 92 // 93 // TODO: Bug 1764925: Track volatile here too if we add GPU L2 caching 94 typedef enum 95 { 96 UVM_PTE_BITS_GPU_READ, 97 UVM_PTE_BITS_GPU_WRITE, 98 UVM_PTE_BITS_GPU_ATOMIC, 99 UVM_PTE_BITS_GPU_MAX 100 } uvm_pte_bits_gpu_t; 101 102 typedef struct 103 { 104 // Per-page residency bit vector, used for fast traversal 105 // of resident pages. 106 // 107 // This follows the same semantics as the CPU residency bit vector and 108 // notably each bit still represents a PAGE_SIZE amount of data, but the 109 // physical GPU memory is tracked by an array of GPU chunks below. 110 uvm_page_mask_t resident; 111 112 // Pages that have been evicted to sysmem 113 uvm_page_mask_t evicted; 114 115 NvU64 *cpu_chunks_dma_addrs; 116 117 // Array of naturally-aligned chunks. Each chunk has the largest possible 118 // size which can fit within the block, so they are not uniform size. 119 // 120 // The number of chunks in the array is calculated using 121 // block_num_gpu_chunks. The size of each chunk is calculated using 122 // block_gpu_chunk_index. 123 uvm_gpu_chunk_t **chunks; 124 125 // These page table ranges are not necessarily all used at the same time. 126 // The block might also be too small or not aligned properly to use the 127 // larger ranges, in which case they're never allocated. 128 // 129 // Once a range is allocated we keep it around to avoid constant allocation 130 // overhead when doing PTE splitting and merging. 131 // 132 // Check range.table to see if a given range has been allocated yet. 133 // 134 // page_table_range_big's range covers the big PTEs which fit within the 135 // interior of this block. See the big_ptes field. 136 uvm_page_table_range_t page_table_range_2m; 137 uvm_page_table_range_t page_table_range_big; 138 uvm_page_table_range_t page_table_range_4k; 139 140 // These flags are ignored unless the {block, gpu} pair supports a 2M page 141 // size. In that case it's the responsibility of the block code to make the 142 // lower page tables active by calling uvm_page_tree_write_pde. 143 // 144 // They can be allocated and activated separately, so we have to track them 145 // separately. 146 // 147 // Activated only means that uvm_page_tree_write_pde has been called at some 148 // point in the past with the appropriate range allocated. It does not imply 149 // that the 2M entry is a PDE (see pte_is_2m). 150 bool activated_big; 151 bool activated_4k; 152 153 // For {block, gpu} pairs which support the 2M page size, the page table 154 // ranges are uninitialized on allocation. This flag tracks whether the big 155 // PTEs have been initialized. 156 // 157 // We don't need an equivalent flag for the 4k range because we always write 158 // just the 4k PTEs not covered by higher-level PTEs. Big PTEs however can 159 // be allocated and activated late while the 4k PTEs are already active, in 160 // which case we need to initialize the entire big range. 
    bool initialized_big;

    // Sticky state to split PTEs to 4k and keep them there. Used when a fatal
    // fault has been detected on this GPU to avoid false dependencies within
    // the uTLB for fatal and non-fatal faults on the same larger PTE, which
    // could lead to wrong fault attribution.
    bool force_4k_ptes;

    // This table shows the HW PTE states given all permutations of pte_is_2m,
    // big_ptes, and pte_bits. Note that the first row assumes that the 4k page
    // tables have been allocated (if not, then no PDEs are allocated either).
    //
    // |-------------- SW state --------------|------------------- HW state --------------------|
    //  pte_is_2m  pte_is_big  pte_bits[READ] | Page size  PDE0(2M only)  Big PTE       4k PTE
    // ----------------------------------------------------------------------------------------
    //  0          0           0              | 4k         Valid PDE      Invalid [1]   Invalid
    //  0          0           1              | 4k         Valid PDE      Invalid [1]   Valid
    //  0          1           0              | Big        Valid PDE      Unmapped [2]  x
    //  0          1           1              | Big        Valid PDE      Valid         x
    //  1          must be 0   0              | 2M         Invalid        x             x
    //  1          must be 0   1              | 2M         Valid PTE      x             x
    //
    // [1]: The big PTE may be unallocated, in which case its pointer won't be
    //      valid in the parent PDE. If the big PTE is allocated, it will be
    //      invalid so the 4k PTEs are active.
    //
    // [2]: The unmapped big PTE pattern differs from the invalid pattern, and
    //      it prevents HW from reading the 4k entries. See the unmapped_pte()
    //      MMU HAL function.

    // If pte_is_2m is true, there is a 2M PTE covering this VA block (valid or
    // invalid). If false then we're in one of the following scenarios:
    // 1) This {block, gpu} does not support 2M pages.
    // 2) 2M pages are supported but the page_table_range_2m has not been
    //    allocated (implying that the other page table ranges have not been
    //    allocated either).
    // 3) page_table_range_2m has been allocated, but the big_ptes bitmap should
    //    be used to determine the mix of big and 4k PTEs.
    bool pte_is_2m;

    // When pte_is_2m is false, this block consists of any possible mix of big
    // and 4k PTEs. This bitmap describes that mix. A set bit indicates that the
    // corresponding big-page-sized region of the block is covered by a big PTE.
    // A cleared bit indicates that it is covered by 4k PTEs.
    //
    // Neither setting implies that the PTE currently has a valid mapping, it
    // just indicates which PTE is read by the GPU (see the table above).
    //
    // The indices represent the corresponding big PTEs in the block's interior.
    // For example, a block with alignment and size of one 4k page on either
    // side of a big page will only use bit 0. Use uvm_va_block_big_page_index
    // to look up the big_ptes index of a page.
    //
    // The block might not be able to fit any big PTEs, in which case this
    // bitmap is always zero. Use uvm_va_block_gpu_num_big_pages to find the
    // number of valid bits in this mask.
    DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);

    // See the comments for uvm_va_block_t::cpu.pte_bits.
    //
    // The major difference is that these bits are always accurate since, unlike
    // the CPU PTEs, the UVM driver is in full control of these mappings.
    //
    // Note that the granularity is always PAGE_SIZE, not whatever GPU PTE size
    // happens to currently map these regions. PAGE_SIZE is the minimum
    // granularity of operations on the VA blocks.
As a future optimization we 227 // could consider sub-PAGE_SIZE operations if PAGE_SIZE > 4K and the CPU 228 // isn't involved, for example false sharing among peer GPUs. 229 uvm_page_mask_t pte_bits[UVM_PTE_BITS_GPU_MAX]; 230 231 } uvm_va_block_gpu_state_t; 232 233 typedef struct 234 { 235 // Per-page residency bit vector, used for fast traversal of resident 236 // pages. 237 // 238 // A set bit means the CPU has a coherent copy of the physical page 239 // resident in the NUMA node's memory, and that a CPU chunk for the 240 // corresponding page index has been allocated. This does not mean that 241 // the coherent copy is currently mapped anywhere, however. A page may be 242 // resident on multiple processors (but not multiple CPU NUMA nodes) when in 243 // read-duplicate mode. 244 // 245 // A cleared bit means the CPU NUMA node does not have a coherent copy of 246 // that page resident. A CPU chunk for the corresponding page index may or 247 // may not have been allocated. If the chunk is present, it's a cached chunk 248 // which can be reused in the future. 249 // 250 // Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is 251 // smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory 252 // overhead on the whole. 253 uvm_page_mask_t resident; 254 255 // Per-page allocation bit vector. 256 // 257 // A set bit means that a CPU chunk has been allocated for the 258 // corresponding page index on this NUMA node. 259 uvm_page_mask_t allocated; 260 261 // CPU memory chunks represent physically contiguous CPU memory 262 // allocations. See uvm_pmm_sysmem.h for more details on CPU chunks. 263 // This member is meant to hold an opaque value indicating the CPU 264 // chunk storage method. For more details on CPU chunk storage, 265 // see uvm_cpu_chunk_storage_type_t in uvm_va_block.c. 266 unsigned long chunks; 267 } uvm_va_block_cpu_node_state_t; 268 269 // TODO: Bug 1766180: Worst-case we could have one of these per system page. 270 // Options: 271 // 1) Rely on the OOM killer to prevent the user from trying to do that 272 // 2) Be much more space-conscious in this struct (difficult) 273 // 3) Cap the per-process range and/or block count, like vm.max_map_count 274 // does for vmas 275 struct uvm_va_block_struct 276 { 277 // Reference count for this block. References are held by: 278 // - The parent VA range for managed blocks or VA space for HMM blocks 279 // - The reverse map 280 // - The eviction path temporarily when attempting to evict a GPU page under 281 // this block 282 // 283 // This isn't protected by the lock on the eviction path, so it must be 284 // atomic. nv_kref provides that. 285 nv_kref_t kref; 286 287 // Lock protecting the block. See the comment at the top of uvm.c. 288 uvm_mutex_t lock; 289 290 // Parent VA range. Managed blocks have this set. HMM blocks will have 291 // va_range set to NULL and hmm.va_space set instead. Dead blocks that are 292 // waiting for the last ref count to be removed have va_range and 293 // hmm.va_space set to NULL (could be either type of block). 294 // 295 // This field can be read while holding either the block lock or just the VA 296 // space lock in read mode, since it can only change when the VA space lock 297 // is held in write mode. 298 uvm_va_range_t *va_range; 299 300 // Virtual address [start, end] covered by this block. These fields can be 301 // read while holding either the block lock or just the VA space lock in 302 // read mode, since they can only change when the VA space lock is held in 303 // write mode. 
304 NvU64 start; 305 NvU64 end; 306 307 // Per-processor residency bit vector, used for fast lookup of which 308 // processors are active in this block. 309 // 310 // A set bit means the corresponding processor has a coherent physical copy 311 // of memory somewhere in the block. The per-processor state must then be 312 // inspected to find out which pages. The processor may or may not have a 313 // mapping to that physical memory, however. 314 // 315 // A cleared bit means the corresponding processor does not have a coherent 316 // physical copy of any pages under this block. The processor may still have 317 // cached pages allocated for future use, however. It also may have mappings 318 // to pages resident on other processors. 319 uvm_processor_mask_t resident; 320 321 // Per-processor mapping bit vector, used for fast lookup of which 322 // processors are active in this block. 323 // 324 // A set bit means the corresponding processor has an active, valid page 325 // table mapping to some VA in this block. The per-processor pte_bits state 326 // must then be inspected to find out the mapping address and permissions. 327 // 328 // A cleared bit means the corresponding processor has no virtual mappings 329 // within this block (all pte_bits entries are 0). 330 uvm_processor_mask_t mapped; 331 332 // Per-processor evicted bit vector, used for fast lookup of which GPUs 333 // have evicted pages in this block. 334 // 335 // A set bit means the corresponding processor was the residency of some of 336 // the pages in the block when they were evicted due to memory capacity 337 // limitations. The per-processor state must then be inspected to find out 338 // which pages. 339 // 340 // A cleared bit means the corresponding processor has no evicted pages 341 // within this block (all evicted entries are 0). 342 uvm_processor_mask_t evicted_gpus; 343 344 struct 345 { 346 // Per-NUMA node tracking of CPU allocations. 347 // This is a dense array with one entry per possible NUMA node. 348 uvm_va_block_cpu_node_state_t **node_state; 349 350 // Per-page allocation bit vector. 351 // 352 // A set bit means that a CPU page has been allocated for the 353 // corresponding page index on at least one CPU NUMA node. 354 uvm_page_mask_t allocated; 355 356 // Per-page residency bit vector. See 357 // uvm_va_block_cpu_numa_state_t::resident for a detailed description. 358 // This mask is a cumulative mask (logical OR) of all 359 // uvm_va_block_cpu_node_state_t::resident masks. It is meant to be used 360 // only for fast testing of page residency when it matters only if the 361 // page is resident on the CPU. 362 // 363 // Note that this mask cannot be set directly as this will cause 364 // inconsistencies between this mask and the per-NUMA residency masks. 365 // In order to properly maintain consistency between the per-NUMA masks 366 // and this one, uvm_va_block_cpu_[set|clear]_residency_*() helpers 367 // should be used. 368 uvm_page_mask_t resident; 369 370 // Per-page mapping bit vectors, one per bit we need to track. These are 371 // used for fast traversal of valid mappings in the block. These contain 372 // all non-address bits needed to establish a virtual mapping on this 373 // processor (permissions, cacheability, etc). 374 // 375 // A cleared bit in UVM_PTE_BITS_CPU_READ means the CPU has no valid 376 // virtual mapping to that address (the access will fault). Further, 377 // UVM_PTE_BITS_CPU_WRITE is guaranteed to also be clear. 
378 // 379 // A set bit in UVM_PTE_BITS_CPU_READ means the CPU has a valid mapping 380 // at that address with at least read permissions. The physical page for 381 // that mapping is contained in the pages array. If 382 // UVM_PTE_BITS_CPU_WRITE is not set, the mapping is read-only. 383 // Otherwise, the mapping is read-write. 384 // 385 // For managed allocations, this is the maximum permissions a PTE 386 // could have, but not necessarily the actual current permissions of the 387 // CPU PTEs. The UVM driver will never change the PTEs without updating 388 // this state, but the kernel can downgrade our CPU mappings at any time 389 // without notifying the UVM driver (for example in response to user 390 // space calling madvise with MADV_DONTNEED). 391 // 392 // For HMM allocations, this is the minimum permission the CPU has since 393 // Linux can upgrade a read-only PTE to read-write without notifying 394 // the UVM driver. This is why read duplication isn't currently 395 // supported. 396 // TODO: Bug 3660922: Need to handle read duplication at some point. 397 uvm_page_mask_t pte_bits[UVM_PTE_BITS_CPU_MAX]; 398 399 // Whether the CPU has ever mapped a page on this VA block. This is 400 // used to force GMMU PDE1 pre-population on ATS systems. See 401 // pre_populate_gpu_pde1 in uvm_va_block.c for more information. 402 NvU8 ever_mapped : 1; 403 404 // We can get "unexpected" faults if multiple CPU threads fault on the 405 // same address simultaneously and race to create the mapping. Since 406 // our CPU fault handler always unmaps to handle the case where the 407 // kernel downgrades our CPU mappings, we can introduce an infinite 408 // stream of CPU faults in multi-threaded workloads. 409 // 410 // In order to handle this scenario, we keep track of the first thread 411 // that faulted on a page with valid permissions and the timestamp. 412 // Then, we keep track of the subsequent faults on that page during a 413 // window of time. If the first thread faults again on the page, that 414 // will indicate that the mapping has been downgraded by the kernel and 415 // we need to remap it. Faults from the rest of threads are just 416 // ignored. The information is also cleared on the following events: 417 // - The tracking window finishes 418 // - The page is unmapped 419 struct 420 { 421 // Timestamp when the first fault was detected. This also is used 422 // as a flag that the contents of this struct are valid 423 NvU64 first_fault_stamp; 424 425 // First thread that faulted while having valid permissions. we 426 // don't take a reference on the pid so we shouldn't ever use it 427 // for task-lookup in the kernel. We only use it as a heuristic so 428 // it's OK if the pid gets destroyed or reused. 429 pid_t first_pid; 430 431 // Index of the page whose faults are being tracked 432 uvm_page_index_t page_index; 433 } fault_authorized; 434 } cpu; 435 436 // Per-GPU residency and mapping state 437 // 438 // TODO: Bug 1766180: Even though these are pointers, making this a static 439 // array will use up a non-trivial amount of storage for small blocks. 440 // In most cases we won't have anywhere near this many GPUs active 441 // anyway. Consider using a dense array of just the GPUs registered in 442 // this VA space, depending on the perf of accessing that array and on 443 // how noticeable this memory overhead actually is. 
444 uvm_va_block_gpu_state_t *gpus[UVM_ID_MAX_GPUS]; 445 446 // Mask to keep track of the pages that are read-duplicate 447 uvm_page_mask_t read_duplicated_pages; 448 449 // Mask to keep track of the pages that are not mapped on any non-UVM-Lite 450 // processor. This mask is not used for HMM because the CPU can map pages 451 // at any time without notifying the driver. 452 // 0: Page is definitely not mapped by any processors 453 // 1: Page may or may not be mapped by a processor 454 // 455 // This mask sets the bit when the page is mapped on any non-UVM-Lite 456 // processor but it is not always unset on unmap (to avoid a performance 457 // impact). Therefore, it can contain false negatives. It should be only 458 // used for opportunistic optimizations that have a fast path for pages 459 // that are not mapped anywhere (see uvm_va_block_migrate_locked, for 460 // example), but not the other way around. 461 uvm_page_mask_t maybe_mapped_pages; 462 463 // Tracks all outstanding GPU work related to this block: GPU copies, PTE 464 // updates, TLB invalidates, etc. The residency and mapping state is only 465 // valid once this tracker is done. 466 // 467 // CPU operations need to wait for this tracker to be done. GPU operations 468 // need to acquire it before pushing their work, then that work must be 469 // added to this tracker before the block's lock is dropped. 470 uvm_tracker_t tracker; 471 472 // A queue item for establishing eviction mappings in a deferred way 473 nv_kthread_q_item_t eviction_mappings_q_item; 474 475 uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT]; 476 477 // Prefetch infomation that is updated while holding the va_block lock but 478 // records state while the lock is not held. 479 struct 480 { 481 uvm_processor_id_t last_migration_proc_id; 482 483 NvU16 fault_migrations_to_last_proc; 484 } prefetch_info; 485 486 struct 487 { 488 #if UVM_IS_CONFIG_HMM() 489 // The MMU notifier is registered per va_block. 490 struct mmu_interval_notifier notifier; 491 #endif 492 493 // This is used to serialize migrations between CPU and GPU while 494 // allowing the va_block lock to be dropped. 495 // This must be acquired before locking the va_block lock if the 496 // critical section can change the residency state. 497 // Do not access directly, use the uvm_hmm_migrate_*() routines. 498 uvm_mutex_t migrate_lock; 499 500 // Sequence number to tell if any changes were made to the va_block 501 // while not holding the block lock and calling hmm_range_fault(). 502 unsigned long changed; 503 504 // Parent VA space pointer. It is NULL for managed blocks or if 505 // the HMM block is dead. This field can be read while holding the 506 // block lock and is only modified while holding the va_space write 507 // lock and va_block lock (same as the va_range pointer). 508 uvm_va_space_t *va_space; 509 510 // Tree of uvm_va_policy_node_t. The policy node ranges always cover 511 // all or part of a VMA range or a contiguous range of VMAs within the 512 // va_block. Policy nodes are resized or deleted when the underlying 513 // VMA range is changed by Linux via the invalidate() callback. 514 // Otherwise, policies could be stale after munmap(). 515 // Locking: The va_block lock is needed to access or modify the tree. 516 uvm_range_tree_t va_policy_tree; 517 518 // Storage node for range tree of va_blocks. 519 uvm_range_tree_node_t node; 520 } hmm; 521 }; 522 523 // We define additional per-VA Block fields for testing. 
When 524 // uvm_enable_builtin_tests is defined, all VA Blocks will have 525 // uvm_va_block_wrapper_t size. Otherwise, the test fields are not available. 526 // Use the uvm_va_block_get_test function defined below to obtain a safe 527 // pointer to uvm_va_block_test_t from a uvm_va_block_t pointer. 528 struct uvm_va_block_wrapper_struct 529 { 530 uvm_va_block_t block; 531 532 struct uvm_va_block_test_struct 533 { 534 // Count of how many page table allocations should be forced to retry 535 // with eviction enabled. Used for testing only. 536 NvU32 page_table_allocation_retry_force_count; 537 538 // Count of how many user pages allocations should be forced to retry 539 // with eviction enabled. Used for testing only. 540 NvU32 user_pages_allocation_retry_force_count; 541 542 // Mask of chunk sizes to be used for CPU chunk allocations. 543 // The actual set of chunk sizes to be used will be the set resulting 544 // from AND'ing this value with the value of 545 // uvm_cpu_chunk_allocation_sizes module parameter. 546 NvU32 cpu_chunk_allocation_size_mask; 547 548 // Subsequent operations that need to allocate CPU pages will fail. As 549 // opposed to other error injection settings, this one fails N times 550 // and then succeeds instead of failing on the Nth try. A value of ~0u 551 // means fail indefinitely. 552 // This is because this error is supposed to be fatal and tests verify 553 // the state of the VA blocks after the failure. However, some tests 554 // use kernels to trigger migrations and a fault replay could trigger 555 // a successful migration if this error flag is cleared. 556 NvU32 inject_cpu_pages_allocation_error_count; 557 558 // A NUMA node ID on which any CPU chunks will be allocated from. 559 // This will override any other setting and/or policy. 560 // Note that the kernel is still free to allocate from any of the 561 // nodes in the thread's policy. 562 int cpu_chunk_allocation_target_id; 563 int cpu_chunk_allocation_actual_id; 564 565 // Force the next eviction attempt on this block to fail. Used for 566 // testing only. 567 bool inject_eviction_error; 568 569 // Force the next successful chunk allocation to then fail. Used for testing 570 // only to simulate driver metadata allocation failure. 571 bool inject_populate_error; 572 573 // Force the next split on this block to fail. 574 // Set by error injection ioctl for testing purposes only. 575 bool inject_split_error; 576 } test; 577 }; 578 579 // Tracking needed for supporting allocation-retry of user GPU memory 580 struct uvm_va_block_retry_struct 581 { 582 // A tracker used for all allocations from PMM. 583 uvm_tracker_t tracker; 584 585 // List of allocated chunks (uvm_gpu_chunk_t). Currently all chunks are of 586 // the same size. However it can contain chunks from multiple GPUs. All 587 // remaining free chunks are freed when the operation is finished with 588 // uvm_va_block_retry_deinit(). 589 struct list_head free_chunks; 590 591 // List of chunks allocated and used during the block operation. This list 592 // can contain chunks from multiple GPUs. All the used chunks are unpinned 593 // when the operation is finished with uvm_va_block_retry_deinit(). 594 struct list_head used_chunks; 595 }; 596 597 // Module load/exit 598 NV_STATUS uvm_va_block_init(void); 599 void uvm_va_block_exit(void); 600 601 // Allocates and initializes the block. The block's ref count is initialized to 602 // 1. The caller is responsible for inserting the block into its parent 603 // va_range. 
604 // 605 // The caller must be holding the VA space lock in at least read mode. 606 // 607 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED. 608 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range, 609 NvU64 start, 610 NvU64 end, 611 uvm_va_block_t **out_block); 612 613 // Internal function called only when uvm_va_block_release drops the ref count 614 // to 0. Do not call directly. 615 void uvm_va_block_destroy(nv_kref_t *kref); 616 617 static inline void uvm_va_block_retain(uvm_va_block_t *va_block) 618 { 619 nv_kref_get(&va_block->kref); 620 } 621 622 // Locking: The va_block lock must not be held. 623 // The va_space lock must be held in write mode unless it is the special case 624 // that the block has no GPU state; for example, right after calling 625 // uvm_va_block_create(). In that case, the va_space lock can be held in read 626 // mode. 627 static inline void uvm_va_block_release(uvm_va_block_t *va_block) 628 { 629 if (va_block) { 630 // The calling thread shouldn't be holding the block's mutex when 631 // releasing the block as it might get destroyed. 632 uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_BLOCK); 633 nv_kref_put(&va_block->kref, uvm_va_block_destroy); 634 } 635 } 636 637 // Same as uvm_va_block_release but the caller may be holding the VA block lock. 638 // The caller must ensure that the refcount will not get to zero in this call. 639 static inline void uvm_va_block_release_no_destroy(uvm_va_block_t *va_block) 640 { 641 int destroyed = nv_kref_put(&va_block->kref, uvm_va_block_destroy); 642 UVM_ASSERT(!destroyed); 643 } 644 645 // Returns true if the block is managed by HMM. 646 // Locking: This can be called while holding either the block lock or just the 647 // VA space lock in read mode, since it can only change when the VA space lock 648 // is held in write mode. 649 static inline bool uvm_va_block_is_hmm(uvm_va_block_t *va_block) 650 { 651 #if UVM_IS_CONFIG_HMM() 652 return va_block->hmm.va_space; 653 #else 654 return false; 655 #endif 656 } 657 658 // Return true if the block is dead. 659 // Locking: This can be called while holding either the block lock or just the 660 // VA space lock in read mode, since it can only change when the VA space lock 661 // is held in write mode. 662 static inline bool uvm_va_block_is_dead(uvm_va_block_t *va_block) 663 { 664 if (va_block->va_range) 665 return false; 666 667 #if UVM_IS_CONFIG_HMM() 668 if (va_block->hmm.va_space) 669 return false; 670 #endif 671 672 return true; 673 } 674 675 static inline uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id) 676 { 677 return va_block->gpus[uvm_id_gpu_index(gpu_id)]; 678 } 679 680 // Return the va_space pointer of the given block or NULL if the block is dead. 681 // Locking: This can be called while holding either the block lock or just the 682 // VA space lock in read mode, since it can only change when the VA space lock 683 // is held in write mode. 684 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block); 685 686 // Return the va_space pointer of the given block assuming the block is not dead 687 // (asserts that it is not dead and asserts va_space is not NULL). 688 // Locking: This can be called while holding either the block lock or just the 689 // VA space lock in read mode, since it can only change when the VA space lock 690 // is held in write mode. 
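//
// For illustration, a sketch (not part of the API) of how a caller that cannot
// rule out a dead block might choose between the two accessors above:
//
//     uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
//
//     if (!va_space) {
//         // Dead (zombie) block: both va_range and hmm.va_space are NULL
//         return;
//     }
//
// Callers that already know the block is alive can call
// uvm_va_block_get_va_space() directly.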
691 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block); 692 693 // Return true if the VA space has access counter migrations enabled and should 694 // remote map pages evicted to system memory. This is OK since access counters 695 // can pull the data back to vidmem if sufficient accesses trigger a migration. 696 // The caller must ensure that the VA space cannot go away. 697 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space); 698 699 // Dynamic cache-based allocation for uvm_va_block_context_t. 700 // 701 // See uvm_va_block_context_init() for a description of the mm parameter. 702 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm); 703 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context); 704 705 // Initialization of an already-allocated uvm_va_block_context_t. 706 // 707 // mm is used to initialize the value of va_block_context->mm. NULL is allowed. 708 void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm); 709 710 // Return the preferred NUMA node ID for the block's policy. 711 // If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID 712 // is returned. 713 int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context); 714 715 // TODO: Bug 1766480: Using only page masks instead of a combination of regions 716 // and page masks could simplify the below APIs and their implementations 717 // at the cost of having to scan the whole mask for small regions. 718 // Investigate the performance effects of doing that. 719 720 // Moves the physical pages of the given region onto the destination processor. 721 // If page_mask is non-NULL, the movement is further restricted to only those 722 // pages in the region which are present in the mask. 723 // 724 // prefetch_page_mask may be passed as a subset of page_mask when cause is 725 // UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT, 726 // UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT, or 727 // UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER to indicate pages that have been 728 // pulled due to automatic page prefetching heuristics. For pages in this mask, 729 // UVM_MAKE_RESIDENT_CAUSE_PREFETCH will be reported in migration events, 730 // instead. 731 // 732 // This function breaks read duplication for all given pages even if they 733 // don't migrate. Pages which are not resident on the destination processor 734 // will also be unmapped from all existing processors, be populated in the 735 // destination processor's memory, and copied to the new physical location. 736 // Any new memory will be zeroed if it is the first allocation for that page 737 // in the system. 738 // 739 // This function does not create any new virtual mappings. 740 // 741 // This function acquires/waits for the va_block tracker and updates that 742 // tracker with any new work pushed. 743 // 744 // Allocation-retry: this operation may need to perform eviction to be able to 745 // allocate GPU memory successfully and if that happens, 746 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned. That also means that the 747 // block's lock has been unlocked and relocked as part of the call and that the 748 // whole sequence of operations performed under the block's lock needs to be 749 // attempted again. To facilitate that, the caller needs to provide the same 750 // va_block_retry struct for each attempt that has been initialized before the 751 // first attempt and needs to be deinitialized after the last one. 
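//
// For illustration only, a hand-rolled retry loop along those lines (a sketch,
// not the canonical implementation; uvm_va_block_retry_init()/_deinit() and the
// uvm_mutex_lock()/uvm_mutex_unlock() helpers are assumed to come from
// elsewhere in the driver and their exact signatures are not part of this
// header):
//
//     uvm_va_block_context_t *ctx = uvm_va_block_context_alloc(mm);
//     uvm_va_block_retry_t retry;
//     NV_STATUS status;
//
//     uvm_va_block_retry_init(&retry);
//     uvm_mutex_lock(&va_block->lock);
//
//     do {
//         status = uvm_va_block_make_resident(va_block,
//                                             &retry,
//                                             ctx,
//                                             dest_id,
//                                             region,
//                                             NULL,    // page_mask: whole region
//                                             NULL,    // prefetch_page_mask
//                                             cause);
//     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);
//
//     uvm_mutex_unlock(&va_block->lock);
//     uvm_va_block_retry_deinit(&retry, va_block);
//     uvm_va_block_context_free(ctx);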
// Most callers can just use UVM_VA_BLOCK_LOCK_RETRY(), which takes care of
// that for the caller.
//
// If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
// user memory is guaranteed not to happen. Allocation-retry of GPU page tables
// can still occur though.
//
// va_block_context must not be NULL and policy for the region must
// match. This function will set a bit in
// va_block_context->make_resident.pages_changed_residency for each
// page that changed residency (due to a migration or first
// population) as a result of the operation and
// va_block_context->make_resident.all_involved_processors for each
// processor involved in the copy. This function only sets bits in
// those masks. It is the caller's responsibility to zero the masks
// beforehand if needed.
//
// va_block_context->make_resident.dest_nid is used to guide the NUMA node for
// CPU allocations.
//
// Notably any status other than NV_OK indicates that the block's lock might
// have been unlocked and relocked.
//
// LOCKING: The caller must hold the va_block lock.
//          If va_block_context->mm != NULL, va_block_context->mm->mmap_lock
//          must be held in at least read mode.
NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
                                     uvm_va_block_retry_t *va_block_retry,
                                     uvm_va_block_context_t *va_block_context,
                                     uvm_processor_id_t dest_id,
                                     uvm_va_block_region_t region,
                                     const uvm_page_mask_t *page_mask,
                                     const uvm_page_mask_t *prefetch_page_mask,
                                     uvm_make_resident_cause_t cause);

// Similar to uvm_va_block_make_resident (read documentation there). The main
// differences are:
// - Pages are copied, not moved (i.e. other copies of the page are not
//   unmapped)
// - Processors with a resident copy of pages that migrated have write and
//   atomic access permission revoked, unlike in uvm_va_block_make_resident
//   where they are unmapped
// - All remote mappings (due to either SetAccessedBy or performance heuristics)
//   are broken
// - Only managed va_blocks are supported.
// TODO: Bug 3660922: need to implement HMM read duplication support.
NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
                                                    uvm_va_block_retry_t *va_block_retry,
                                                    uvm_va_block_context_t *va_block_context,
                                                    uvm_processor_id_t dest_id,
                                                    uvm_va_block_region_t region,
                                                    const uvm_page_mask_t *page_mask,
                                                    const uvm_page_mask_t *prefetch_page_mask,
                                                    uvm_make_resident_cause_t cause);

// Similar to uvm_va_block_make_resident() (read documentation there). The
// difference is that source pages are only copied to the destination and the
// residency is not updated until uvm_va_block_make_resident_finish() is called.
// Otherwise, the combination of uvm_va_block_make_resident_copy() and
// uvm_va_block_make_resident_finish() is the same as just calling
// uvm_va_block_make_resident(). Note, however, that the va_block lock must be
// held across the two calls for the operation to be complete. The va_block
// lock can be dropped after calling uvm_va_block_make_resident_copy() but
// uvm_va_block_make_resident_copy() must be called again after relocking the
// va_block lock and before calling uvm_va_block_make_resident_finish().
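//
// For illustration, the simplest form of this two-step flow, with the lock held
// across both calls (a sketch only; the declarations follow below, and the
// finish page_mask must be the same as or a subset of the copy page_mask):
//
//     status = uvm_va_block_make_resident_copy(va_block, &retry, ctx, dest_id,
//                                              region, page_mask, NULL, cause);
//     if (status != NV_OK)
//         return status;
//
//     // ... e.g. decide which pages actually migrated ...
//
//     uvm_va_block_make_resident_finish(va_block, ctx, region, page_mask);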
817 // This split is needed when using migrate_vma_setup() and migrate_vma_pages() 818 // so that when migrate_vma_pages() indicates a page is not migrating, the 819 // va_block state is not updated. 820 // LOCKING: The caller must hold the va_block lock. 821 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block, 822 uvm_va_block_retry_t *va_block_retry, 823 uvm_va_block_context_t *va_block_context, 824 uvm_processor_id_t dest_id, 825 uvm_va_block_region_t region, 826 const uvm_page_mask_t *page_mask, 827 const uvm_page_mask_t *prefetch_page_mask, 828 uvm_make_resident_cause_t cause); 829 830 // The page_mask must be the same or a subset of the page_mask passed to 831 // uvm_va_block_make_resident_copy(). This step updates the residency and breaks 832 // read duplication. 833 // LOCKING: The caller must hold the va_block lock. 834 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block, 835 uvm_va_block_context_t *va_block_context, 836 uvm_va_block_region_t region, 837 const uvm_page_mask_t *page_mask); 838 839 // Creates or upgrades a mapping from the input processor to the given virtual 840 // address region. Pages which already have new_prot permissions or higher are 841 // skipped, so this call ensures that the range is mapped with at least new_prot 842 // permissions. new_prot must not be UVM_PROT_NONE. uvm_va_block_unmap or 843 // uvm_va_block_revoke_prot should be used to downgrade permissions instead. 844 // 845 // The mapped pages are described by the region parameter and the map page mask 846 // that allows the caller to restrict the map operation to specific pages within 847 // the region. If the page mask is NULL then the whole region is mapped. 848 // 849 // If the input processor is a GPU with no GPU VA space registered, or if the 850 // input processor is the CPU and this thread is not allowed to create CPU 851 // mappings, this function does nothing. CPU mappings are only allowed if 852 // uvm_va_range_vma_check(va_block_context->mm) is valid, so the caller must 853 // set va_block_context->mm before calling this function. 854 // 855 // cause specifies the cause to be reported in events in case a remote mapping 856 // is created. 857 // 858 // Any CPU mappings will wait for the va_block tracker. If this function pushes 859 // GPU work it will first acquire the va_block tracker, then add the pushed work 860 // to out_tracker. It is the caller's responsibility to add this work to 861 // va_block's tracker. Note that while it is generally safe to run map 862 // operations on different GPUs concurrently, two PTE operations (map, unmap, 863 // revoke) on the same GPU must be serialized even if they target different 864 // pages because the earlier operation can cause a PTE split or merge which is 865 // assumed by the later operation. 866 // 867 // va_block_context must not be NULL and policy for the region must match. 868 // See the comments for uvm_va_block_check_policy_is_valid(). 869 // 870 // If allocation-retry was required as part of the operation and was successful, 871 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the 872 // out_tracker were added to the block's tracker and then the block's lock was 873 // unlocked and relocked. 874 // 875 // In general, any status other than NV_OK indicates that the block's lock might 876 // have been unlocked and relocked. 877 // 878 // LOCKING: The caller must hold the va block lock. 
If va_block_context->mm != 879 // NULL, va_block_context->mm->mmap_lock must be held in at least read 880 // mode. 881 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block, 882 uvm_va_block_context_t *va_block_context, 883 uvm_processor_id_t id, 884 uvm_va_block_region_t region, 885 const uvm_page_mask_t *map_page_mask, 886 uvm_prot_t new_prot, 887 UvmEventMapRemoteCause cause, 888 uvm_tracker_t *out_tracker); 889 890 // Like uvm_va_block_map, except it maps all processors in the input mask. The 891 // VA block tracker contains all map operations on return. 892 // 893 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like 894 // uvm_va_block_map() indicating that the operation needs to be retried. 895 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block, 896 uvm_va_block_context_t *va_block_context, 897 const uvm_processor_mask_t *map_processor_mask, 898 uvm_va_block_region_t region, 899 const uvm_page_mask_t *map_page_mask, 900 uvm_prot_t new_prot, 901 UvmEventMapRemoteCause cause); 902 903 // Unmaps virtual regions from a single processor. This does not free page 904 // tables or physical memory. This is safe to call on the eviction path, but the 905 // caller must ensure that the block hasn't been killed. 906 // 907 // The unmapped pages are described by the region parameter and the unmap page 908 // mask that allows the caller to restrict the unmap operation to specific pages 909 // within the region. If the page mask is NULL then the whole region is 910 // unmapped. 911 // 912 // If id is UVM_ID_CPU, this is guaranteed to return NV_OK, and this is safe to 913 // call without holding a reference on the mm which owns the associated vma. 914 // 915 // Any CPU unmappings will wait for the va_block tracker. If this function 916 // pushes GPU work it will first acquire the va_block tracker, then add the 917 // pushed work to out_tracker. It is the caller's responsibility to add this 918 // work to va_block's tracker. Note that while it is generally safe to run unmap 919 // operations on different GPUs concurrently, two PTE operations (map, unmap, 920 // revoke) on the same GPU must be serialized even if they target different 921 // pages because the earlier operation can cause a PTE split or merge which is 922 // assumed by the later operation. 923 // 924 // va_block_context must not be NULL. 925 // 926 // If allocation-retry was required as part of the operation and was successful, 927 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the 928 // out_tracker were added to the block's tracker and then the block's lock was 929 // unlocked and relocked. It is guaranteed that retry will not be required if 930 // the unmap does not cause a PTE split. Examples of operations which will not 931 // cause a PTE split include unmapping the entire block, unmapping all PTEs with 932 // matching attributes, and unmapping all PTEs which point to the same physical 933 // chunk. 934 // 935 // LOCKING: The caller must hold the va_block lock. 936 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block, 937 uvm_va_block_context_t *va_block_context, 938 uvm_processor_id_t id, 939 uvm_va_block_region_t region, 940 const uvm_page_mask_t *unmap_page_mask, 941 uvm_tracker_t *out_tracker); 942 943 // Like uvm_va_block_unmap, except it unmaps all processors in the input mask. 944 // The VA block tracker contains all map operations on return. 
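//
// For illustration, a minimal use of uvm_va_block_unmap() (declared above)
// showing the out_tracker contract. uvm_tracker_init(),
// uvm_tracker_add_tracker_safe() and uvm_tracker_deinit() are assumed helpers
// from uvm_tracker.h and are not declared in this header:
//
//     uvm_tracker_t local_tracker;
//
//     uvm_tracker_init(&local_tracker);
//     status = uvm_va_block_unmap(va_block, ctx, gpu_id, region, NULL, &local_tracker);
//
//     // The pushed GPU work must be added to the block's tracker before the
//     // block lock is dropped, even if the unmap returned an error.
//     uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
//     uvm_tracker_deinit(&local_tracker);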
945 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block, 946 uvm_va_block_context_t *va_block_context, 947 const uvm_processor_mask_t *unmap_processor_mask, 948 uvm_va_block_region_t region, 949 const uvm_page_mask_t *unmap_page_mask); 950 951 // Function called when the preferred location changes. Notably: 952 // - Mark all CPU pages as dirty because the new processor may not have 953 // up-to-date data. 954 // - Unmap the preferred location's processor from any pages in this region 955 // which are not resident on the preferred location. 956 // 957 // va_block_context must not be NULL and policy for the region must match. 958 // See the comments for uvm_va_block_check_policy_is_valid(). 959 // 960 // LOCKING: The caller must hold the VA block lock. 961 NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block, 962 uvm_va_block_context_t *va_block_context, 963 uvm_va_block_region_t region); 964 965 // Maps the given processor to all resident pages in this block, as allowed by 966 // location and policy. Waits for the operation to complete before returning. 967 // This function should only be called with managed va_blocks. 968 // 969 // va_block_context must not be NULL and policy for the region must match. 970 // See the comments for uvm_va_block_check_policy_is_valid(). 971 // 972 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm 973 // != NULL, va_block_context->mm->mmap_lock must be held in at least 974 // read mode. 975 NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block, 976 uvm_va_block_context_t *va_block_context, 977 uvm_processor_id_t processor_id); 978 979 // Maps given processor to all resident pages in this block and region, as 980 // allowed by location and policy. The caller is responsible for waiting for 981 // the tracker after all mappings have been started. 982 // This function can be called with HMM and managed va_blocks. 983 // 984 // va_block_context must not be NULL and policy for the region must match. 985 // See the comments for uvm_va_block_check_policy_is_valid(). 986 // 987 // LOCKING: The caller must hold the va_block lock and 988 // va_block_context->mm->mmap_lock must be held in at least read mode. 989 NV_STATUS uvm_va_block_set_accessed_by_locked(uvm_va_block_t *va_block, 990 uvm_va_block_context_t *va_block_context, 991 uvm_processor_id_t processor_id, 992 uvm_va_block_region_t region, 993 uvm_tracker_t *out_tracker); 994 995 // Breaks SetAccessedBy and remote mappings 996 // This function should only be called with managed va_blocks. 997 // 998 // va_block_context must not be NULL and policy for the region must match. 999 // See the comments for uvm_va_block_check_policy_is_valid(). 1000 // 1001 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm 1002 // != NULL, va_block_context->mm->mmap_lock must be held in at least 1003 // read mode. 1004 NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block, 1005 uvm_va_block_context_t *va_block_context); 1006 1007 // Restores SetAccessedBy mappings 1008 // This function should only be called with managed va_blocks. 1009 // 1010 // va_block_context must not be NULL and policy for the region must match. 1011 // See the comments for uvm_va_block_check_policy_is_valid(). 1012 // 1013 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm 1014 // != NULL, va_block_context->mm->mmap_lock must be held in at least 1015 // read mode. 
1016 NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block, 1017 uvm_va_block_context_t *va_block_context); 1018 1019 // Check if processor_id is allowed to access the va_block with access_type 1020 // permissions. Return values: 1021 // 1022 // NV_ERR_INVALID_ADDRESS The VA block is logically dead (zombie) 1023 // NV_ERR_INVALID_ACCESS_TYPE The vma corresponding to the VA range does not 1024 // allow access_type permissions, or migration is 1025 // disallowed and processor_id cannot access the 1026 // range remotely (UVM-Lite). 1027 // NV_ERR_INVALID_OPERATION The access would violate the policies specified 1028 // by UvmPreventMigrationRangeGroups. 1029 // 1030 // va_block_context must not be NULL, policy must match, and if the va_block is 1031 // a HMM block, va_block_context->hmm.vma must be valid which also means the 1032 // va_block_context->mm is not NULL, retained, and locked for at least read. 1033 // Locking: the va_block lock must be held. 1034 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block, 1035 uvm_va_block_context_t *va_block_context, 1036 uvm_processor_id_t processor_id, 1037 uvm_page_index_t page_index, 1038 uvm_fault_type_t access_type, 1039 bool allow_migration); 1040 1041 // API for access privilege revocation 1042 // 1043 // Revoke prot_to_revoke access permissions for the given processor. 1044 // 1045 // The revoked pages are described by the region parameter and the revoke page 1046 // mask that allows the caller to restrict the revoke operation to specific 1047 // pages within the region. 1048 // 1049 // prot_to_revoke must be greater than UVM_PROT_READ_ONLY. Caller should call 1050 // unmap explicitly if it wants to revoke all access privileges. 1051 // 1052 // If id is UVM_ID_CPU, and prot_to_revoke is UVM_PROT_READ_WRITE_ATOMIC, no 1053 // action is performed. If the processor id corresponds to the CPU and the 1054 // caller cannot establish CPU mappings because it does not have a reference on 1055 // vma->vm_mm (va_block_context->mm != vma->vm_mm), the page will be simply 1056 // unmapped. Caller should call unmap explicitly if it wants to revoke all 1057 // access privileges. 1058 // 1059 // Any CPU revocation will wait for the va_block tracker. If this function 1060 // pushes GPU work it will first acquire the va_block tracker, then add the 1061 // pushed work to out_tracker. It is the caller's responsibility to add this 1062 // work to va_block's tracker. Note that while it is generally safe to run 1063 // revocation operations on different GPUs concurrently, two PTE operations 1064 // (map, unmap, revoke) on the same GPU must be serialized even if they target 1065 // different pages because the earlier operation can cause a PTE split or merge 1066 // which is assumed by the later operation. 1067 // 1068 // va_block_context must not be NULL. 1069 // 1070 // If allocation-retry was required as part of the operation and was successful, 1071 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the 1072 // out_tracker were added to the block's tracker and then the block's lock was 1073 // unlocked and relocked. 1074 // 1075 // In general, any status other than NV_OK indicates that the block's lock might 1076 // have been unlocked and relocked. 1077 // 1078 // LOCKING: The caller must hold the va block lock. If va_block_context->mm != 1079 // NULL, va_block_context->mm->mmap_lock must be held in at least read 1080 // mode. 
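//
// For illustration, revoking GPU atomic access over a whole region while
// leaving read/write mappings intact might look like the following (a sketch;
// tracker handling follows the same out_tracker contract described above):
//
//     status = uvm_va_block_revoke_prot(va_block,
//                                       ctx,
//                                       gpu_id,
//                                       region,
//                                       NULL,                        // all pages in the region
//                                       UVM_PROT_READ_WRITE_ATOMIC,  // revoke atomics only
//                                       &local_tracker);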
NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
                                   uvm_va_block_context_t *va_block_context,
                                   uvm_processor_id_t id,
                                   uvm_va_block_region_t region,
                                   const uvm_page_mask_t *revoke_page_mask,
                                   uvm_prot_t prot_to_revoke,
                                   uvm_tracker_t *out_tracker);

// Like uvm_va_block_revoke_prot(), except it revokes all processors in the
// input mask. The VA block tracker contains all revocation operations on
// return.
//
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
// uvm_va_block_revoke_prot() indicating that the operation needs to be retried.
NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
                                        uvm_va_block_context_t *va_block_context,
                                        const uvm_processor_mask_t *revoke_processor_mask,
                                        uvm_va_block_region_t region,
                                        const uvm_page_mask_t *revoke_page_mask,
                                        uvm_prot_t prot_to_revoke);

// Tries to map all pages in the given region and map_page_mask with at most
// max_prot privileges for appropriate processors as determined by the
// accessed_by mask, heuristics and the given processor mask (excluding
// processor_id, which triggered the migration and should have already been
// mapped).
//
// va_block_context must not be NULL and policy for the region must match.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// This function acquires/waits for the va_block tracker and updates that
// tracker with any new work pushed.
//
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
// uvm_va_block_map() indicating that the operation needs to be retried.
//
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
//          NULL, va_block_context->mm->mmap_lock must be held in at least read
//          mode.
NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
                                                    uvm_va_block_context_t *va_block_context,
                                                    uvm_processor_id_t new_residency,
                                                    uvm_processor_id_t processor_id,
                                                    uvm_va_block_region_t region,
                                                    const uvm_page_mask_t *map_page_mask,
                                                    uvm_prot_t max_prot,
                                                    const uvm_processor_mask_t *processor_mask);

// Maps processors using SetAccessedBy to all resident pages in the region
// parameter. On Volta+ it is also used to map evicted pages that can be later
// pulled back by using access counters.
//
// This function acquires/waits for the va_block tracker and updates that
// tracker with any new work pushed.
//
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
// uvm_va_block_map() indicating that the operation needs to be retried.
//
// va_block_context must not be NULL and policy for the region must match.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
//          NULL, va_block_context->mm->mmap_lock must be held in at least read
//          mode.
NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
                                    uvm_va_block_context_t *va_block_context,
                                    uvm_processor_id_t processor_id,
                                    uvm_va_block_region_t region,
                                    const uvm_page_mask_t *page_mask,
                                    UvmEventMapRemoteCause cause);

// Notifies the VA block that a new GPU VA space has been created.
1153 // LOCKING: The caller must hold the va_block lock 1154 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space); 1155 1156 // Destroys the VA block's mappings and page tables on the GPU, if it has any. 1157 // 1158 // If mm != NULL, that mm is used for any CPU mappings which may be created as 1159 // a result of this call. See uvm_va_block_context_t::mm for details. 1160 // 1161 // va_block_context must not be NULL. 1162 // 1163 // LOCKING: The caller must hold the va_block lock. If block_context->mm is not 1164 // NULL, the caller must hold mm->mmap_lock in at least read mode. 1165 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, 1166 uvm_gpu_va_space_t *gpu_va_space, 1167 uvm_va_block_context_t *block_context); 1168 1169 // Creates any mappings necessary in this VA block between the two GPUs, in 1170 // either direction. 1171 // LOCKING: The caller must hold the va_block lock 1172 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1); 1173 1174 // Unmaps all page tables in this VA block which have peer mappings between 1175 // the two GPUs, in either direction. 1176 // LOCKING: The caller must hold the va_block lock 1177 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1); 1178 1179 // Unmap any mappings from GPU to the preferred location. 1180 // 1181 // The GPU has to be in UVM-Lite mode. 1182 // 1183 // LOCKING: The caller must hold the va_block lock 1184 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu); 1185 1186 // Frees all memory under this block associated with this GPU. Any portion of 1187 // the block which is resident on the GPU is evicted to sysmem before being 1188 // freed. 1189 // 1190 // If mm != NULL, that mm is used for any CPU mappings which may be created as 1191 // a result of this call. See uvm_va_block_context_t::mm for details. 1192 // 1193 // LOCKING: This takes and releases the VA block lock. If mm != NULL, the caller 1194 // must hold mm->mmap_lock in at least read mode. 1195 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm); 1196 1197 // Same as uvm_va_block_unregister_gpu() but the VA block lock must be held. 1198 // Note that this handles allocation-retry internally and hence might unlock 1199 // and relock block's lock. 1200 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm); 1201 1202 // Unmaps all memory associated with the block and drops the ref count of the 1203 // block. This allows the caller to free resources associated with this block 1204 // regardless of the block's current ref count. Most importantly it allows the 1205 // VA covered by this block to be immediately available for other page table 1206 // mappings upon return. 1207 // 1208 // This clears block->va_range, so only the VA range destroy path should call 1209 // it. Other paths with references on this block, specifically the eviction path 1210 // which temporarily takes a reference to the block, must always check the block 1211 // state after taking the block lock to see if their mapping is still in place. 1212 // 1213 // All of the unmap and state destruction steps are also performed when the ref 1214 // count goes to 0, so this function only needs to be called if the block's 1215 // resources need to be reclaimed immediately. 
1216 // 1217 // The caller should not lock the block before calling this function. 1218 // 1219 // This performs a uvm_va_block_release. 1220 void uvm_va_block_kill(uvm_va_block_t *va_block); 1221 1222 // Exactly the same split semantics as uvm_va_range_split, including error 1223 // handling. See that function's comments for details. 1224 // 1225 // new_va_block's va_range is set to new_va_range before any reverse mapping is 1226 // established to the new block, but the caller is responsible for inserting the 1227 // new block into the range. 1228 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block, 1229 NvU64 new_end, 1230 uvm_va_block_t **new_va_block, 1231 uvm_va_range_t *new_va_range); 1232 1233 // Exactly the same split semantics as uvm_va_block_split, including error 1234 // handling except the existing_va_block block lock needs to be held and 1235 // the new_va_block has to be preallocated. 1236 // Also note that the existing_va_block lock may be dropped and re-acquired. 1237 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block, 1238 NvU64 new_end, 1239 uvm_va_block_t *new_va_block, 1240 uvm_va_range_t *new_va_range); 1241 1242 // Handles a CPU fault in the given VA block, performing any operations 1243 // necessary to establish a coherent CPU mapping (migrations, cache invalidates, 1244 // etc.). 1245 // 1246 // Locking: 1247 // - vma->vm_mm->mmap_lock must be held in at least read mode. Note, that 1248 // might not be the same as current->mm->mmap_lock. 1249 // - va_space lock must be held in at least read mode 1250 // 1251 // service_context->block_context.mm is ignored and vma->vm_mm is used instead. 1252 // 1253 // Returns NV_ERR_INVALID_ACCESS_TYPE if a CPU mapping to fault_addr cannot be 1254 // accessed, for example because it's within a range group which is non- 1255 // migratable. 1256 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block, 1257 NvU64 fault_addr, 1258 bool is_write, 1259 uvm_service_block_context_t *service_context); 1260 1261 // Performs any operations necessary to establish a coherent mapping 1262 // (migrations, cache invalidates, etc.) in response to the given service block 1263 // context. 1264 // 1265 // service_context must not be NULL and policy for service_context->region must 1266 // match. See the comments for uvm_va_block_check_policy_is_valid(). If 1267 // va_block is a HMM block, va_block_context->hmm.vma must be valid. See the 1268 // comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h. 1269 // service_context->prefetch_hint is set by this function. 1270 // 1271 // Locking: 1272 // - service_context->block_context.mm->mmap_lock must be held in at least 1273 // read mode, if valid. 1274 // - va_space lock must be held in at least read mode 1275 // - va_block lock must be held 1276 // 1277 // If allocation-retry was required as part of the operation and was successful, 1278 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock 1279 // was unlocked and relocked. 1280 // 1281 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected 1282 // and the performance heuristics logic decided to throttle execution. 1283 // Any other error code different than NV_OK indicates OOM or a global fatal 1284 // error. 
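//
// Illustrative sketch only (not taken from any particular caller): one way a
// caller that already holds the block lock might drive the allocation-retry
// protocol is with the UVM_VA_BLOCK_RETRY_LOCKED() helper defined at the end
// of this header. 'processor_id' and 'service_context' are assumed to have
// been set up by the caller.
//
//     uvm_va_block_retry_t block_retry;
//     NV_STATUS status;
//
//     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &block_retry,
//                                        uvm_va_block_service_locked(processor_id,
//                                                                    va_block,
//                                                                    &block_retry,
//                                                                    service_context));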
NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
                                      uvm_va_block_t *va_block,
                                      uvm_va_block_retry_t *block_retry,
                                      uvm_service_block_context_t *service_context);

// Performs population of the destination pages, unmapping and copying source
// pages to new_residency.
//
// service_context must not be NULL and policy for service_context->region must
// match. See the comments for uvm_va_block_check_policy_is_valid(). If
// va_block is a HMM block, va_block_context->hmm.vma must be valid. See the
// comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
// service_context->prefetch_hint should be set before calling this function.
//
// Locking:
// - service_context->block_context.mm->mmap_lock must be held in at least
//   read mode, if valid.
// - va_space lock must be held in at least read mode
// - va_block lock must be held
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
// was unlocked and relocked.
//
// NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
// and the performance heuristics logic decided to throttle execution.
// Any other error code different than NV_OK indicates OOM or a global fatal
// error.
NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
                                    uvm_processor_id_t new_residency,
                                    uvm_va_block_t *va_block,
                                    uvm_va_block_retry_t *block_retry,
                                    uvm_service_block_context_t *service_context);

// This updates the va_block residency state and maps the faulting processor_id
// to the new residency (which may be remote).
//
// service_context must not be NULL and policy for service_context->region must
// match. See the comments for uvm_va_block_check_policy_is_valid(). If
// va_block is a HMM block, va_block_context->hmm.vma must be valid. See the
// comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
// service_context must be initialized by calling uvm_va_block_service_copy()
// before calling this function.
//
// Locking:
// - service_context->block_context.mm->mmap_lock must be held in at least
//   read mode, if valid.
// - va_space lock must be held in at least read mode
// - va_block lock must be held
// - the mmap lock and va_space lock must be held across the calls to
//   uvm_va_block_service_copy() and this function. If the va_block lock is
//   dropped in between, special care is needed to check for eviction and
//   invalidation callbacks.
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
// was unlocked and relocked.
//
// NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
// and the performance heuristics logic decided to throttle execution.
// Any other error code different than NV_OK indicates OOM or a global fatal
// error.
NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
                                      uvm_va_block_t *va_block,
                                      uvm_service_block_context_t *service_context);

// Allocate GPU state for the given va_block and registered GPUs.
// Locking: The block lock must be held.
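//
// Illustrative sketch (not from any particular caller) of the expected locking
// around this call; error handling is elided and 'status' is an NV_STATUS
// assumed to be declared by the caller:
//
//     uvm_mutex_lock(&va_block->lock);
//     status = uvm_va_block_gpu_state_alloc(va_block);
//     uvm_mutex_unlock(&va_block->lock);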
1353 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block); 1354 1355 // Release any GPU or policy data associated with the given region in response 1356 // to munmap(). 1357 // Locking: The va_block lock must be held. 1358 void uvm_va_block_munmap_region(uvm_va_block_t *va_block, 1359 uvm_va_block_region_t region); 1360 1361 // Size of the block in bytes. Guaranteed to be a page-aligned value between 1362 // PAGE_SIZE and UVM_VA_BLOCK_SIZE. 1363 static inline NvU64 uvm_va_block_size(uvm_va_block_t *block) 1364 { 1365 NvU64 size = block->end - block->start + 1; 1366 UVM_ASSERT(PAGE_ALIGNED(size)); 1367 UVM_ASSERT(size >= PAGE_SIZE); 1368 UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE); 1369 return size; 1370 } 1371 1372 // Number of pages with PAGE_SIZE in the block 1373 static inline size_t uvm_va_block_num_cpu_pages(uvm_va_block_t *block) 1374 { 1375 return uvm_va_block_size(block) / PAGE_SIZE; 1376 } 1377 1378 // VA of the given page using CPU page size. page_index must be valid 1379 static inline NvU64 uvm_va_block_cpu_page_address(uvm_va_block_t *block, uvm_page_index_t page_index) 1380 { 1381 UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(block)); 1382 return block->start + PAGE_SIZE * page_index; 1383 } 1384 1385 // Get the physical address on the given GPU for given residency 1386 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block, 1387 uvm_page_index_t page_index, 1388 uvm_processor_id_t residency, 1389 uvm_gpu_t *gpu); 1390 1391 // Get the page physical address on the given GPU 1392 // 1393 // This will assert that GPU state is indeed present. 1394 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, 1395 uvm_page_index_t page_index, 1396 uvm_gpu_t *gpu); 1397 1398 static bool uvm_va_block_contains_address(uvm_va_block_t *block, NvU64 address) 1399 { 1400 return address >= block->start && address <= block->end; 1401 } 1402 1403 // Obtain a pointer to the uvm_va_block_test_t structure for the given VA 1404 // block. If uvm_enable_builtin_tests is unset, NULL will be returned. 1405 static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block) 1406 { 1407 if (uvm_enable_builtin_tests) 1408 return &container_of(va_block, uvm_va_block_wrapper_t, block)->test; 1409 1410 return NULL; 1411 } 1412 1413 // Get the page residency mask for a processor if it's known to be there. 1414 // 1415 // If the processor is the CPU, the residency mask for the NUMA node ID 1416 // specified by nid will be returned (see 1417 // uvm_va_block_cpu_node_state_t::resident). If nid is NUMA_NO_NODE, 1418 // the cumulative CPU residency mask will be returned (see 1419 // uvm_va_block_t::cpu::resident). 1420 // 1421 // If the processor is a GPU, this will assert that GPU state is indeed present. 1422 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid); 1423 1424 // Get the page mapped mask for a processor. The returned mask cannot be 1425 // directly modified by the caller 1426 // 1427 // If the processor is a GPU, this will assert that GPU state is indeed present. 1428 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor); 1429 1430 // Return a mask of non-UVM-Lite pages that are unmapped within the given 1431 // region. 1432 // Locking: The block lock must be held. 
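//
// Illustrative sketch, assuming 'unmapped' is caller-provided uvm_page_mask_t
// storage and the block lock is already held: collect the unmapped pages of
// the whole block and walk them with the page iterators defined later in this
// header.
//
//     uvm_page_index_t page_index;
//     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
//
//     uvm_va_block_unmapped_pages_get(va_block, region, &unmapped);
//
//     for_each_va_block_page_in_mask(page_index, &unmapped, va_block) {
//         // page_index is a non-UVM-Lite page with no mapping in the region
//     }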
1433 void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block, 1434 uvm_va_block_region_t region, 1435 uvm_page_mask_t *out_mask); 1436 1437 // VA block lookup functions. There are a number of permutations which might be 1438 // useful, such as looking up the block from {va_space, va_range} x {addr, 1439 // block index}. The ones implemented here and in uvm_va_range.h support the 1440 // primary three use cases, which are: 1441 // 1) Iterating over all VA blocks in a VA range. This uses block indices on the 1442 // VA range: 1443 // uvm_va_range_num_blocks 1444 // uvm_va_range_block_index 1445 // uvm_va_range_block 1446 // uvm_va_range_block_create 1447 // 2) Operating on a single VA block (fault). This looks up the block using the 1448 // VA space and address: 1449 // uvm_va_block_find 1450 // uvm_va_block_find_create 1451 // 3) Operating on a single VA block (fault). This looks up the block using the 1452 // supplied VA range and address: 1453 // uvm_va_block_find_create_in_range 1454 1455 // Finds the UVM or HMM VA block containing addr, if any. The va_space->lock 1456 // must be held in at least read mode. Return values: 1457 // NV_ERR_INVALID_ADDRESS addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor 1458 // a HMM enabled VMA. 1459 // 1460 // NV_ERR_OBJECT_NOT_FOUND addr is valid but no block has been allocated to 1461 // cover it yet 1462 // 1463 // NV_OK The block was returned successfully 1464 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block); 1465 1466 // Same as uvm_va_block_find except that the block is created if not found. 1467 // If addr is covered by a UVM_VA_RANGE_TYPE_MANAGED va_range a managed block 1468 // will be created. If addr is not covered by any va_range and HMM is 1469 // enabled in the va_space then a HMM block will be created and hmm_vma is 1470 // set to the VMA covering 'addr'. The va_space_mm must be retained and locked. 1471 // Otherwise hmm_vma is set to NULL. 1472 // Return values: 1473 // NV_ERR_INVALID_ADDRESS addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor 1474 // a HMM enabled VMA. 1475 // NV_ERR_NO_MEMORY memory could not be allocated. 1476 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space, 1477 NvU64 addr, 1478 struct vm_area_struct **hmm_vma, 1479 uvm_va_block_t **out_block); 1480 1481 // Same as uvm_va_block_find_create except that only managed va_blocks are 1482 // created if not already present in the VA range. Does not require va_space_mm 1483 // to be locked or retained. 1484 NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space, 1485 NvU64 addr, 1486 uvm_va_block_t **out_block); 1487 1488 // Same as uvm_va_block_find_create_managed except that va_range lookup was 1489 // already done by the caller. The supplied va_range must not be NULL. 1490 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space, 1491 uvm_va_range_t *va_range, 1492 NvU64 addr, 1493 uvm_va_block_t **out_block); 1494 1495 // Look up a chunk backing a specific address within the VA block. 1496 // Returns NULL if none. 1497 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address); 1498 1499 // Implementation of the UvmMigrate() API at the VA block scope. 1500 // 1501 // The out_tracker can be NULL. 1502 // 1503 // If do_mappings is false, mappings are not added after pages have been 1504 // migrated. 1505 // 1506 // The caller needs to handle allocation-retry. va_block_retry can be NULL if 1507 // the destination is the CPU. 
1508 // 1509 // va_block_context must not be NULL and policy for the region must match. See 1510 // the comments for uvm_va_block_check_policy_is_valid(). If va_block is a HMM 1511 // block, va_block_context->hmm.vma must be valid. See the comments for 1512 // uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h. 1513 // 1514 // LOCKING: The caller must hold the va_block lock. If va_block_context->mm != 1515 // NULL, va_block_context->mm->mmap_lock must be held in at least 1516 // read mode. 1517 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block, 1518 uvm_va_block_retry_t *va_block_retry, 1519 uvm_va_block_context_t *va_block_context, 1520 uvm_va_block_region_t region, 1521 uvm_processor_id_t dest_id, 1522 uvm_migrate_mode_t mode, 1523 uvm_tracker_t *out_tracker); 1524 1525 // Write block's data from a CPU buffer 1526 // 1527 // The [dst, dst + size) range has to fit within a single PAGE_SIZE page. 1528 // 1529 // va_block_context must not be NULL. The caller is not required to set 1530 // va_block_context->hmm.vma. 1531 // 1532 // The caller needs to support allocation-retry of page tables. 1533 // 1534 // LOCKING: The caller must hold the va_block lock 1535 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block, 1536 uvm_va_block_context_t *block_context, 1537 NvU64 dst, 1538 uvm_mem_t *src, 1539 size_t size); 1540 1541 // Read block's data into a CPU buffer 1542 // 1543 // The [src, src + size) range has to fit within a single PAGE_SIZE page. 1544 // 1545 // LOCKING: The caller must hold the va_block lock 1546 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst, NvU64 src, size_t size); 1547 1548 // Initialize va block retry tracking 1549 void uvm_va_block_retry_init(uvm_va_block_retry_t *uvm_va_block_retry); 1550 1551 // Deinitialize va block retry tracking after a block operation 1552 // 1553 // Frees all the remaining free chunks and unpins all the used chunks. 1554 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *uvm_va_block_retry, uvm_va_block_t *va_block); 1555 1556 // Evict all chunks from the block that are subchunks of the passed in root_chunk. 1557 // 1558 // Add all the work tracking the eviction to the tracker. 1559 // 1560 // Returns NV_OK if the block is dead or doesn't have any subchunks of the 1561 // root_chunk. 1562 // 1563 // LOCKING: The caller must hold the va_block lock 1564 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, 1565 uvm_gpu_t *gpu, 1566 uvm_gpu_chunk_t *root_chunk, 1567 uvm_tracker_t *tracker); 1568 1569 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp); 1570 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp); 1571 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp); 1572 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp); 1573 1574 // Compute the offset in system pages of addr from the start of va_block. 1575 static uvm_page_index_t uvm_va_block_cpu_page_index(uvm_va_block_t *va_block, NvU64 addr) 1576 { 1577 UVM_ASSERT(addr >= va_block->start); 1578 UVM_ASSERT(addr <= va_block->end); 1579 return (addr - va_block->start) / PAGE_SIZE; 1580 } 1581 1582 // Computes the size and index in the gpu_state chunks array of the GPU chunk 1583 // which corresponds to the given page_index of the VA region. 
1584 // Note this is only used for testing and does not work on HMM va_blocks as it 1585 // returns incorrect results for those. 1586 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start, 1587 NvU64 size, 1588 uvm_gpu_t *gpu, 1589 uvm_page_index_t page_index, 1590 uvm_chunk_size_t *out_chunk_size); 1591 1592 // If there are any resident CPU pages in the block, mark them as dirty 1593 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block); 1594 1595 // Sets the internal state required to handle fault cancellation 1596 // 1597 // This function may require allocating page tables to split big pages into 4K 1598 // pages. If allocation-retry was required as part of the operation and was 1599 // successful, NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case the 1600 // block's lock was unlocked and relocked. 1601 // 1602 // va_block_context must not be NULL. 1603 // 1604 // LOCKING: The caller must hold the va_block lock. 1605 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu); 1606 1607 // 1608 // uvm_va_block_region_t helpers 1609 // 1610 1611 static uvm_va_block_region_t uvm_va_block_region(uvm_page_index_t first, uvm_page_index_t outer) 1612 { 1613 BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= (1 << (sizeof(first) * 8))); 1614 1615 UVM_ASSERT(first <= outer); 1616 1617 return (uvm_va_block_region_t){ .first = first, .outer = outer }; 1618 } 1619 1620 static uvm_va_block_region_t uvm_va_block_region_for_page(uvm_page_index_t page_index) 1621 { 1622 return uvm_va_block_region(page_index, page_index + 1); 1623 } 1624 1625 static size_t uvm_va_block_region_num_pages(uvm_va_block_region_t region) 1626 { 1627 return region.outer - region.first; 1628 } 1629 1630 static NvU64 uvm_va_block_region_size(uvm_va_block_region_t region) 1631 { 1632 return uvm_va_block_region_num_pages(region) * PAGE_SIZE; 1633 } 1634 1635 static NvU64 uvm_va_block_region_start(uvm_va_block_t *va_block, uvm_va_block_region_t region) 1636 { 1637 return va_block->start + region.first * PAGE_SIZE; 1638 } 1639 1640 static NvU64 uvm_va_block_region_end(uvm_va_block_t *va_block, uvm_va_block_region_t region) 1641 { 1642 return va_block->start + region.outer * PAGE_SIZE - 1; 1643 } 1644 1645 static bool uvm_va_block_region_contains_region(uvm_va_block_region_t region, uvm_va_block_region_t subregion) 1646 { 1647 return subregion.first >= region.first && subregion.outer <= region.outer; 1648 } 1649 1650 static bool uvm_va_block_region_contains_page(uvm_va_block_region_t region, uvm_page_index_t page_index) 1651 { 1652 return uvm_va_block_region_contains_region(region, uvm_va_block_region_for_page(page_index)); 1653 } 1654 1655 // Create a block range from a va block and start and end virtual addresses 1656 // within the block. 
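//
// For example (illustrative, assuming a 4K PAGE_SIZE and a block starting at
// 0x100000): start = 0x102000 and end = 0x105fff produce
// { .first = 2, .outer = 6 }, i.e. the four pages covering [0x102000, 0x106000).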
1657 static uvm_va_block_region_t uvm_va_block_region_from_start_end(uvm_va_block_t *va_block, NvU64 start, NvU64 end) 1658 { 1659 uvm_va_block_region_t region; 1660 1661 UVM_ASSERT(start < end); 1662 UVM_ASSERT(start >= va_block->start); 1663 UVM_ASSERT(end <= va_block->end); 1664 UVM_ASSERT(PAGE_ALIGNED(start)); 1665 UVM_ASSERT(PAGE_ALIGNED(end + 1)); 1666 1667 region.first = uvm_va_block_cpu_page_index(va_block, start); 1668 region.outer = uvm_va_block_cpu_page_index(va_block, end) + 1; 1669 1670 return region; 1671 } 1672 1673 static uvm_va_block_region_t uvm_va_block_region_from_start_size(uvm_va_block_t *va_block, NvU64 start, NvU64 size) 1674 { 1675 return uvm_va_block_region_from_start_end(va_block, start, start + size - 1); 1676 } 1677 1678 static uvm_va_block_region_t uvm_va_block_region_from_block(uvm_va_block_t *va_block) 1679 { 1680 return uvm_va_block_region(0, uvm_va_block_num_cpu_pages(va_block)); 1681 } 1682 1683 // Create a block region from a va block and page mask. If va_block is NULL, the 1684 // region is assumed to cover the maximum va_block size. Note that the region 1685 // covers the first through the last set bit and may have unset bits in between. 1686 static uvm_va_block_region_t uvm_va_block_region_from_mask(uvm_va_block_t *va_block, const uvm_page_mask_t *page_mask) 1687 { 1688 uvm_va_block_region_t region; 1689 uvm_page_index_t outer; 1690 1691 if (va_block) 1692 outer = uvm_va_block_num_cpu_pages(va_block); 1693 else 1694 outer = PAGES_PER_UVM_VA_BLOCK; 1695 1696 region.first = find_first_bit(page_mask->bitmap, outer); 1697 if (region.first >= outer) { 1698 region = uvm_va_block_region(0, 0); 1699 } 1700 else { 1701 // At least one bit is set so find_last_bit() should not return 'outer'. 1702 region.outer = find_last_bit(page_mask->bitmap, outer) + 1; 1703 UVM_ASSERT(region.outer <= outer); 1704 } 1705 1706 return region; 1707 } 1708 1709 static bool uvm_page_mask_test(const uvm_page_mask_t *mask, uvm_page_index_t page_index) 1710 { 1711 UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK); 1712 1713 return test_bit(page_index, mask->bitmap); 1714 } 1715 1716 static bool uvm_page_mask_test_and_set(uvm_page_mask_t *mask, uvm_page_index_t page_index) 1717 { 1718 UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK); 1719 1720 return __test_and_set_bit(page_index, mask->bitmap); 1721 } 1722 1723 static bool uvm_page_mask_test_and_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index) 1724 { 1725 UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK); 1726 1727 return __test_and_clear_bit(page_index, mask->bitmap); 1728 } 1729 1730 static void uvm_page_mask_set(uvm_page_mask_t *mask, uvm_page_index_t page_index) 1731 { 1732 UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK); 1733 1734 __set_bit(page_index, mask->bitmap); 1735 } 1736 1737 static void uvm_page_mask_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index) 1738 { 1739 UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK); 1740 1741 __clear_bit(page_index, mask->bitmap); 1742 } 1743 1744 static bool uvm_page_mask_region_test(const uvm_page_mask_t *mask, 1745 uvm_va_block_region_t region, 1746 uvm_page_index_t page_index) 1747 { 1748 if (!uvm_va_block_region_contains_page(region, page_index)) 1749 return false; 1750 1751 return !mask || uvm_page_mask_test(mask, page_index); 1752 } 1753 1754 static NvU32 uvm_page_mask_region_weight(const uvm_page_mask_t *mask, uvm_va_block_region_t region) 1755 { 1756 NvU32 weight_before = 0; 1757 1758 if (region.first > 0) 1759 weight_before = bitmap_weight(mask->bitmap, region.first); 
1760 1761 return bitmap_weight(mask->bitmap, region.outer) - weight_before; 1762 } 1763 1764 static bool uvm_page_mask_region_empty(const uvm_page_mask_t *mask, uvm_va_block_region_t region) 1765 { 1766 return find_next_bit(mask->bitmap, region.outer, region.first) == region.outer; 1767 } 1768 1769 static bool uvm_page_mask_region_full(const uvm_page_mask_t *mask, uvm_va_block_region_t region) 1770 { 1771 return find_next_zero_bit(mask->bitmap, region.outer, region.first) == region.outer; 1772 } 1773 1774 static void uvm_page_mask_region_fill(uvm_page_mask_t *mask, uvm_va_block_region_t region) 1775 { 1776 bitmap_set(mask->bitmap, region.first, region.outer - region.first); 1777 } 1778 1779 static void uvm_page_mask_region_clear(uvm_page_mask_t *mask, uvm_va_block_region_t region) 1780 { 1781 bitmap_clear(mask->bitmap, region.first, region.outer - region.first); 1782 } 1783 1784 static void uvm_page_mask_region_clear_outside(uvm_page_mask_t *mask, uvm_va_block_region_t region) 1785 { 1786 if (region.first > 0) 1787 bitmap_clear(mask->bitmap, 0, region.first); 1788 if (region.outer < PAGES_PER_UVM_VA_BLOCK) 1789 bitmap_clear(mask->bitmap, region.outer, PAGES_PER_UVM_VA_BLOCK - region.outer); 1790 } 1791 1792 static void uvm_page_mask_zero(uvm_page_mask_t *mask) 1793 { 1794 bitmap_zero(mask->bitmap, PAGES_PER_UVM_VA_BLOCK); 1795 } 1796 1797 static bool uvm_page_mask_empty(const uvm_page_mask_t *mask) 1798 { 1799 return bitmap_empty(mask->bitmap, PAGES_PER_UVM_VA_BLOCK); 1800 } 1801 1802 static bool uvm_page_mask_full(const uvm_page_mask_t *mask) 1803 { 1804 return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK); 1805 } 1806 1807 static void uvm_page_mask_fill(uvm_page_mask_t *mask) 1808 { 1809 bitmap_fill(mask->bitmap, PAGES_PER_UVM_VA_BLOCK); 1810 } 1811 1812 static bool uvm_page_mask_and(uvm_page_mask_t *mask_out, 1813 const uvm_page_mask_t *mask_in1, 1814 const uvm_page_mask_t *mask_in2) 1815 { 1816 return bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK); 1817 } 1818 1819 static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out, 1820 const uvm_page_mask_t *mask_in1, 1821 const uvm_page_mask_t *mask_in2) 1822 { 1823 return bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK); 1824 } 1825 1826 static void uvm_page_mask_or(uvm_page_mask_t *mask_out, 1827 const uvm_page_mask_t *mask_in1, 1828 const uvm_page_mask_t *mask_in2) 1829 { 1830 bitmap_or(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK); 1831 } 1832 1833 static void uvm_page_mask_complement(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in) 1834 { 1835 bitmap_complement(mask_out->bitmap, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK); 1836 } 1837 1838 static void uvm_page_mask_copy(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in) 1839 { 1840 bitmap_copy(mask_out->bitmap, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK); 1841 } 1842 1843 static NvU32 uvm_page_mask_weight(const uvm_page_mask_t *mask) 1844 { 1845 return bitmap_weight(mask->bitmap, PAGES_PER_UVM_VA_BLOCK); 1846 } 1847 1848 static bool uvm_page_mask_subset(const uvm_page_mask_t *subset, const uvm_page_mask_t *mask) 1849 { 1850 return bitmap_subset(subset->bitmap, mask->bitmap, PAGES_PER_UVM_VA_BLOCK); 1851 } 1852 1853 static bool uvm_page_mask_equal(const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2) 1854 { 1855 return bitmap_equal(mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK); 1856 } 1857 1858 static bool 
uvm_page_mask_init_from_region(uvm_page_mask_t *mask_out, 1859 uvm_va_block_region_t region, 1860 const uvm_page_mask_t *mask_in) 1861 { 1862 uvm_page_mask_zero(mask_out); 1863 uvm_page_mask_region_fill(mask_out, region); 1864 1865 if (mask_in) 1866 return uvm_page_mask_and(mask_out, mask_out, mask_in); 1867 1868 return true; 1869 } 1870 1871 static void uvm_page_mask_shift_right(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift) 1872 { 1873 bitmap_shift_right(mask_out->bitmap, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK); 1874 } 1875 1876 static void uvm_page_mask_shift_left(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift) 1877 { 1878 bitmap_shift_left(mask_out->bitmap, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK); 1879 } 1880 1881 static bool uvm_page_mask_intersects(const uvm_page_mask_t *mask1, const uvm_page_mask_t *mask2) 1882 { 1883 return bitmap_intersects(mask1->bitmap, mask2->bitmap, PAGES_PER_UVM_VA_BLOCK); 1884 } 1885 1886 // Print the given page mask on the given buffer using hex symbols. The 1887 // minimum required size of the buffer is UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE. 1888 static void uvm_page_mask_print(const uvm_page_mask_t *mask, char *buffer) 1889 { 1890 // There are two cases, which depend on PAGE_SIZE 1891 if (PAGES_PER_UVM_VA_BLOCK > 32) { 1892 NvLength current_long_idx = UVM_PAGE_MASK_WORDS - 1; 1893 const char *buffer_end = buffer + UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE; 1894 1895 UVM_ASSERT(sizeof(*mask->bitmap) == 8); 1896 1897 // For 4KB pages, we need to iterate over multiple words 1898 do { 1899 NvU64 current_long = mask->bitmap[current_long_idx]; 1900 1901 buffer += sprintf(buffer, "%016llx", current_long); 1902 if (current_long_idx != 0) 1903 buffer += sprintf(buffer, ":"); 1904 } while (current_long_idx-- != 0); 1905 1906 UVM_ASSERT(buffer <= buffer_end); 1907 } 1908 else { 1909 NvU32 value = (unsigned)*mask->bitmap; 1910 1911 UVM_ASSERT(PAGES_PER_UVM_VA_BLOCK == 32); 1912 1913 // For 64KB pages, a single print suffices 1914 sprintf(buffer, "%08x", value); 1915 } 1916 } 1917 1918 static uvm_va_block_region_t uvm_va_block_first_subregion_in_mask(uvm_va_block_region_t region, 1919 const uvm_page_mask_t *page_mask) 1920 { 1921 uvm_va_block_region_t subregion; 1922 1923 if (!page_mask) 1924 return region; 1925 1926 subregion.first = find_next_bit(page_mask->bitmap, region.outer, region.first); 1927 subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1); 1928 return subregion; 1929 } 1930 1931 static uvm_va_block_region_t uvm_va_block_next_subregion_in_mask(uvm_va_block_region_t region, 1932 const uvm_page_mask_t *page_mask, 1933 uvm_va_block_region_t previous_subregion) 1934 { 1935 uvm_va_block_region_t subregion; 1936 1937 if (!page_mask) { 1938 subregion.first = region.outer; 1939 subregion.outer = region.outer; 1940 return subregion; 1941 } 1942 1943 subregion.first = find_next_bit(page_mask->bitmap, region.outer, previous_subregion.outer + 1); 1944 subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1); 1945 return subregion; 1946 } 1947 1948 // Iterate over contiguous subregions of the region given by the page mask. 1949 // If the page mask is NULL then it behaves as if it was a fully set mask and 1950 // the only subregion iterated over will be the region itself. 
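//
// Illustrative usage sketch, assuming 'region' and 'page_mask' have been set
// up by the caller:
//
//     uvm_va_block_region_t subregion;
//
//     for_each_va_block_subregion_in_mask(subregion, page_mask, region) {
//         // subregion covers one contiguous run of set bits within region
//     }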
1951 #define for_each_va_block_subregion_in_mask(subregion, page_mask, region) \ 1952 for ((subregion) = uvm_va_block_first_subregion_in_mask((region), (page_mask)); \ 1953 (subregion).first != (region).outer; \ 1954 (subregion) = uvm_va_block_next_subregion_in_mask((region), (page_mask), (subregion))) 1955 1956 static uvm_page_index_t uvm_va_block_first_page_in_mask(uvm_va_block_region_t region, 1957 const uvm_page_mask_t *page_mask) 1958 { 1959 if (page_mask) 1960 return find_next_bit(page_mask->bitmap, region.outer, region.first); 1961 else 1962 return region.first; 1963 } 1964 1965 static uvm_page_index_t uvm_va_block_next_page_in_mask(uvm_va_block_region_t region, 1966 const uvm_page_mask_t *page_mask, 1967 uvm_page_index_t previous_page) 1968 { 1969 if (page_mask) { 1970 return find_next_bit(page_mask->bitmap, region.outer, previous_page + 1); 1971 } 1972 else { 1973 UVM_ASSERT(previous_page < region.outer); 1974 return previous_page + 1; 1975 } 1976 } 1977 1978 static uvm_page_index_t uvm_va_block_first_unset_page_in_mask(uvm_va_block_region_t region, 1979 const uvm_page_mask_t *page_mask) 1980 { 1981 if (page_mask) 1982 return find_next_zero_bit(page_mask->bitmap, region.outer, region.first); 1983 else 1984 return region.first; 1985 } 1986 1987 static uvm_page_index_t uvm_va_block_next_unset_page_in_mask(uvm_va_block_region_t region, 1988 const uvm_page_mask_t *page_mask, 1989 uvm_page_index_t previous_page) 1990 { 1991 if (page_mask) { 1992 return find_next_zero_bit(page_mask->bitmap, region.outer, previous_page + 1); 1993 } 1994 else { 1995 UVM_ASSERT(previous_page < region.outer); 1996 return previous_page + 1; 1997 } 1998 } 1999 2000 static NvU64 uvm_reverse_map_start(const uvm_reverse_map_t *reverse_map) 2001 { 2002 return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first); 2003 } 2004 2005 static NvU64 uvm_reverse_map_end(const uvm_reverse_map_t *reverse_map) 2006 { 2007 return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first) + 2008 uvm_va_block_region_size(reverse_map->region) - 1; 2009 } 2010 2011 // Iterate over contiguous pages of the region given by the page mask. 2012 // If the page mask is NULL then it behaves as if it was a fully set mask and 2013 // it will iterate over all pages within the region. 2014 #define for_each_va_block_page_in_region_mask(page_index, page_mask, region) \ 2015 for ((page_index) = uvm_va_block_first_page_in_mask((region), (page_mask)); \ 2016 (page_index) != (region).outer; \ 2017 (page_index) = uvm_va_block_next_page_in_mask((region), (page_mask), (page_index))) 2018 2019 // Same as for_each_va_block_page_in_region_mask, but the region spans the 2020 // whole given VA block 2021 #define for_each_va_block_page_in_mask(page_index, page_mask, va_block) \ 2022 for_each_va_block_page_in_region_mask(page_index, page_mask, uvm_va_block_region_from_block(va_block)) 2023 2024 // Similar to for_each_va_block_page_in_region_mask, but iterating over pages 2025 // whose bit is unset. 2026 #define for_each_va_block_unset_page_in_region_mask(page_index, page_mask, region) \ 2027 for ((page_index) = uvm_va_block_first_unset_page_in_mask((region), (page_mask)); \ 2028 (page_index) != (region).outer; \ 2029 (page_index) = uvm_va_block_next_unset_page_in_mask((region), (page_mask), (page_index))) 2030 2031 // Similar to for_each_va_block_page_in_mask, but iterating over pages whose 2032 // bit is unset. 
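//
// Illustrative sketch: walk the pages of a block which have no CPU copy on any
// NUMA node, using the cumulative CPU residency mask returned by
// uvm_va_block_resident_mask_get() (see its comment above). UVM_ID_CPU comes
// from uvm_processors.h; the block lock is assumed to be held.
//
//     uvm_page_index_t page_index;
//     uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
//
//     for_each_va_block_unset_page_in_mask(page_index, resident, va_block) {
//         // page_index has no resident CPU copy
//     }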
2033 #define for_each_va_block_unset_page_in_mask(page_index, page_mask, va_block) \ 2034 for_each_va_block_unset_page_in_region_mask(page_index, page_mask, uvm_va_block_region_from_block(va_block)) 2035 2036 // Iterate over all pages within the given region 2037 #define for_each_va_block_page_in_region(page_index, region) \ 2038 for_each_va_block_page_in_region_mask((page_index), NULL, (region)) 2039 2040 // Iterate over all pages within the given VA block 2041 #define for_each_va_block_page(page_index, va_block) \ 2042 for_each_va_block_page_in_region((page_index), uvm_va_block_region_from_block(va_block)) 2043 2044 // Return the first vma intersecting the region [start, va_block->end] 2045 // or NULL if no such vma exists. Also returns the region covered by 2046 // the vma within the va_block. 2047 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block, 2048 struct mm_struct *mm, 2049 NvU64 start, 2050 uvm_va_block_region_t *region); 2051 2052 // Iterate over all vma regions covered by a va_block 2053 #define for_each_va_block_vma_region(va_block, mm, vma, region) \ 2054 for (vma = uvm_va_block_find_vma_region((va_block), (mm), (va_block)->start, (region)); \ 2055 (vma); \ 2056 vma = uvm_va_block_find_vma_region((va_block), \ 2057 (mm), \ 2058 uvm_va_block_region_end((va_block), *(region)) + 1, \ 2059 (region))) 2060 2061 // Return the block region covered by the given chunk size. page_index must be 2062 // any page within the block known to be covered by the chunk. 2063 static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block, 2064 uvm_chunk_size_t chunk_size, 2065 uvm_page_index_t page_index) 2066 { 2067 NvU64 page_addr = uvm_va_block_cpu_page_address(block, page_index); 2068 NvU64 chunk_start_addr = UVM_ALIGN_DOWN(page_addr, chunk_size); 2069 uvm_page_index_t first = (uvm_page_index_t)((chunk_start_addr - block->start) / PAGE_SIZE); 2070 return uvm_va_block_region(first, first + (chunk_size / PAGE_SIZE)); 2071 } 2072 2073 // 2074 // Helpers for page state (permissions, size, residency) 2075 // 2076 2077 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block, 2078 uvm_page_index_t page_index, 2079 uvm_gpu_id_t gpu_id, 2080 uvm_prot_t required_prot); 2081 2082 // Compute the processors that have a copy of the given page resident in their 2083 // memory. 2084 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block, 2085 uvm_page_index_t page_index, 2086 uvm_processor_mask_t *resident_processors); 2087 2088 // Count how many processors have a copy of the given page resident in their 2089 // memory. 2090 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index); 2091 2092 // Get the processor with a resident copy of a page closest to the given 2093 // processor. 2094 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block, 2095 uvm_page_index_t page_index, 2096 uvm_processor_id_t processor); 2097 2098 // Mark CPU page page_index as resident on NUMA node specified by nid. 2099 // nid cannot be NUMA_NO_NODE. 2100 void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index); 2101 2102 // Test if a CPU page is resident on NUMA node nid. If nid is NUMA_NO_NODE, 2103 // the function will return True if the page is resident on any CPU NUMA node. 
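//
// For example (illustrative, 'page_index' assumed valid): a nid of 0 tests
// residency on node 0 only, while NUMA_NO_NODE tests residency on any node:
//
//     bool on_node0 = uvm_va_block_cpu_is_page_resident_on(va_block, 0, page_index);
//     bool on_any_node = uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index);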
2104 bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index); 2105 2106 // Test if all pages in region are resident on NUMA node nid. If nid is 2107 // NUMA_NO_NODE, the function will test if the pages in the region are 2108 // resident on any CPU NUMA node. 2109 bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region); 2110 2111 // Insert a CPU chunk at the given page_index into the va_block. 2112 // Locking: The va_block lock must be held. 2113 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index); 2114 2115 // Remove a CPU chunk at the given page_index from the va_block. 2116 // nid cannot be NUMA_NO_NODE. 2117 // Locking: The va_block lock must be held. 2118 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index); 2119 2120 // Return the CPU chunk at the given page_index on the given NUMA node from the 2121 // va_block. nid cannot be NUMA_NO_NODE. 2122 // Locking: The va_block lock must be held. 2123 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, 2124 int nid, 2125 uvm_page_index_t page_index); 2126 2127 // Return the struct page * from the chunk corresponding to the given page_index 2128 // Locking: The va_block lock must be held. 2129 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index); 2130 2131 // Return the struct page * of the resident chunk at the given page_index from 2132 // the va_block. The given page_index must be resident on the CPU. 2133 // Locking: The va_block lock must be held. 2134 struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index); 2135 2136 // Physically map a CPU chunk so it is DMA'able from all registered GPUs. 2137 // nid cannot be NUMA_NO_NODE. 2138 // Locking: The va_block lock must be held. 2139 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block, 2140 uvm_cpu_chunk_t *chunk, 2141 uvm_page_index_t page_index); 2142 2143 // Physically unmap a CPU chunk from all registered GPUs. 2144 // Locking: The va_block lock must be held. 2145 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block, 2146 uvm_cpu_chunk_t *chunk, 2147 uvm_page_index_t page_index); 2148 2149 // Remove any CPU chunks in the given region. 2150 // Locking: The va_block lock must be held. 
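//
// Illustrative sketch, assuming 'addr' is a page-aligned address within the
// block and the block lock is held: drop the CPU chunk backing a single page.
//
//     uvm_va_block_remove_cpu_chunks(va_block,
//                                    uvm_va_block_region_from_start_size(va_block, addr, PAGE_SIZE));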
void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);

// Get CPU page size or 0 if it is not mapped
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
                                 uvm_page_index_t page_index);

// Get GPU page size or 0 if it is not mapped on the given GPU
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);

// Get page size or 0 if it is not mapped on the given processor
static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
                                              uvm_processor_id_t processor_id,
                                              uvm_page_index_t page_index)
{
    if (UVM_ID_IS_CPU(processor_id))
        return uvm_va_block_page_size_cpu(va_block, page_index);
    else
        return uvm_va_block_page_size_gpu(va_block, processor_id, page_index);
}

// Returns the big page size for the GPU VA space of the block
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);

// Returns the number of big pages in the VA block for the given size
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);

// Returns the number of big pages in the VA block for the big page size on the
// given GPU
static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
{
    return uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu));
}

// Returns the start address of the given big page index and big page size
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);

// Returns the region [start, end] of the given big page index and big page size
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
                                                   size_t big_page_index,
                                                   NvU32 big_page_size);

// Returns the largest sub-region of [start, end] which can fit big pages. If
// the region cannot fit any big pages, an invalid region (0, 0) is returned.
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);

// Returns the largest sub-region of 'region' which can fit big pages. If the
// region cannot fit any big pages, an invalid region (0, 0) is returned.
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
                                                          uvm_va_block_region_t region,
                                                          NvU32 big_page_size);

// Returns the big page index (the bit index within
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
// page_index cannot be covered by a big PTE due to alignment or block size,
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);

// Returns the new residency for a page that faulted or triggered access counter
// notifications. The read_duplicate output parameter indicates if the page
// meets the requirements to be read-duplicated. va_block_context must not be
// NULL, and if the va_block is a HMM block, va_block_context->hmm.vma must be
// valid, which also means va_block_context->mm is not NULL, retained, and
// locked for at least read. See the comments for
// uvm_va_block_check_policy_is_valid() and uvm_hmm_check_context_vma_is_valid()
// in uvm_hmm.h.
// Locking: the va_block lock must be held.
uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
                                                 uvm_va_block_context_t *va_block_context,
                                                 uvm_page_index_t page_index,
                                                 uvm_processor_id_t processor_id,
                                                 NvU32 access_type_mask,
                                                 const uvm_va_policy_t *policy,
                                                 const uvm_perf_thrashing_hint_t *thrashing_hint,
                                                 uvm_service_operation_t operation,
                                                 bool *read_duplicate);

// Return the maximum mapping protection for processor_id that will not require
// any permission revocation on the rest of processors.
uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
                                                        uvm_processor_id_t processor_id,
                                                        uvm_page_index_t page_index);

// A helper macro for handling allocation-retry
//
// The macro takes a VA block, a uvm_va_block_retry_t struct, and a function
// call to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED.
//
// block_retry can be NULL if it's not necessary for the function call,
// otherwise it will be initialized and deinitialized by the macro.
//
// The macro also locks and unlocks the block's lock internally, as it's
// expected that the block's lock has been unlocked and relocked whenever the
// function call returns NV_ERR_MORE_PROCESSING_REQUIRED, and this makes it
// clear that the block's state is not locked across these calls.
#define UVM_VA_BLOCK_LOCK_RETRY(va_block, block_retry, call) ({        \
    NV_STATUS status;                                                  \
    uvm_va_block_t *__block = (va_block);                              \
    uvm_va_block_retry_t *__retry = (block_retry);                     \
                                                                       \
    uvm_va_block_retry_init(__retry);                                  \
                                                                       \
    uvm_mutex_lock(&__block->lock);                                    \
                                                                       \
    do {                                                               \
        status = (call);                                               \
    } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);               \
                                                                       \
    uvm_mutex_unlock(&__block->lock);                                  \
                                                                       \
    uvm_va_block_retry_deinit(__retry, __block);                       \
                                                                       \
    status;                                                            \
})

// A helper macro for handling allocation-retry
//
// The macro takes a VA block, a uvm_va_block_retry_t struct, and a function
// call to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED.
//
// block_retry can be NULL if it's not necessary for the function call,
// otherwise it will be initialized and deinitialized by the macro.
//
// This macro, as opposed to UVM_VA_BLOCK_LOCK_RETRY(), expects the block lock
// to be already taken. Notably the block's lock might be unlocked and relocked
// as part of the call.
#define UVM_VA_BLOCK_RETRY_LOCKED(va_block, block_retry, call) ({      \
    NV_STATUS status;                                                  \
    uvm_va_block_t *__block = (va_block);                              \
    uvm_va_block_retry_t *__retry = (block_retry);                     \
                                                                       \
    uvm_va_block_retry_init(__retry);                                  \
                                                                       \
    uvm_assert_mutex_locked(&__block->lock);                           \
                                                                       \
    do {                                                               \
        status = (call);                                               \
    } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);               \
                                                                       \
    uvm_va_block_retry_deinit(__retry, __block);                       \
                                                                       \
    status;                                                            \
})

#endif // __UVM_VA_BLOCK_H__