1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #ifndef __UVM_VA_BLOCK_H__
25 #define __UVM_VA_BLOCK_H__
26 
27 #include "uvm_forward_decl.h"
28 #include "uvm_types.h"
29 #include "uvm_linux.h"
30 #include "nv-kref.h"
31 #include "uvm_common.h"
32 #include "uvm_perf_module.h"
33 #include "uvm_processors.h"
34 #include "uvm_lock.h"
35 #include "uvm_test_ioctl.h"
36 #include "uvm_tracker.h"
37 #include "uvm_pmm_gpu.h"
38 #include "uvm_perf_thrashing.h"
39 #include "uvm_perf_utils.h"
40 #include "uvm_va_block_types.h"
41 #include "uvm_range_tree.h"
42 #include "uvm_mmu.h"
43 #include "nv-kthread-q.h"
44 
45 #include <linux/mmu_notifier.h>
46 #include <linux/wait.h>
47 #include <linux/nodemask.h>
48 
49 // VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
50 // (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
51 //
52 //  UVM: uvm_va_space -> uvm_va_range -> uvm_va_block
53 //  HMM: uvm_va_space -> uvm_va_block
54 //
55 // Each VA block is contained within a single VA range, and contains state on
56 // VAs covered by that block. Most importantly, the block tracks the current
57 // state of the virtual-to-physical mappings for all VAs within that block
58 // across all processors, along with the physical residency location for each
59 // VA.
60 //
61 // The block serializes both CPU and GPU operations on all VAs under that block.
62 // The CPU work is serialized with the block lock, and the GPU work is
63 // serialized by the block work tracker which itself is protected by the block
64 // lock.
65 //
66 // The size of each block varies from the size of the smallest VA range
67 // (PAGE_SIZE) to the max block size specified by UVM_VA_BLOCK_BITS. No block
68 // will span a 2^UVM_VA_BLOCK_BITS boundary in VA space. The size of the block
69 // is determined by the alignment of the parent VA range and the block's
70 // placement within the range.
71 //
// Note that this means user space gets the best allocation efficiency when it
// allocates memory in naturally-aligned chunks of 2^UVM_VA_BLOCK_BITS bytes.
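//
// For illustration, a minimal sketch (hypothetical code, not part of this API)
// of the boundary a block can never cross, assuming UVM_VA_BLOCK_SIZE is
// defined as (1ULL << UVM_VA_BLOCK_BITS) in uvm_va_block_types.h:
//
//     // Outermost [start, end] that a block containing 'addr' could possibly cover
//     NvU64 outer_start = addr & ~(UVM_VA_BLOCK_SIZE - 1);
//     NvU64 outer_end   = outer_start + UVM_VA_BLOCK_SIZE - 1;
//
// Allocations aligned to and sized in multiples of UVM_VA_BLOCK_SIZE therefore
// allow each block to reach its maximum size.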
74 
75 // enums used for indexing into the array of pte_bits bitmaps in the VA block
76 // which hold the current state of each PTE. For a given {processor, PTE}, the
77 // bits represented here must be enough to re-create the non-address portion of
78 // the PTE for that processor.
79 
80 // If _READ is not set, the PTE mapping is not valid.
81 // If _WRITE is set, _READ is also set (_WRITE implies _READ).
82 typedef enum
83 {
84     UVM_PTE_BITS_CPU_READ,
85     UVM_PTE_BITS_CPU_WRITE,
86     UVM_PTE_BITS_CPU_MAX
87 } uvm_pte_bits_cpu_t;
88 
89 // If _READ is not set, the PTE mapping is not valid.
90 // If _WRITE is set, _READ is also set (_WRITE implies _READ).
91 // If _ATOMIC is set, _WRITE is also set (_ATOMIC implies _WRITE and _READ).
92 //
93 // TODO: Bug 1764925: Track volatile here too if we add GPU L2 caching
94 typedef enum
95 {
96     UVM_PTE_BITS_GPU_READ,
97     UVM_PTE_BITS_GPU_WRITE,
98     UVM_PTE_BITS_GPU_ATOMIC,
99     UVM_PTE_BITS_GPU_MAX
100 } uvm_pte_bits_gpu_t;
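
// An illustrative sketch of the implication invariant above (not taken from
// the driver; uvm_page_mask_test() is the page-mask helper assumed to be
// available from uvm_va_block_types.h, and gpu_state/page_index are
// hypothetical local variables):
//
//     // _ATOMIC set for a page implies _WRITE is set, which implies _READ
//     if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index)) {
//         UVM_ASSERT(uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index));
//         UVM_ASSERT(uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index));
//     }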
101 
102 typedef struct
103 {
104     // Per-page residency bit vector, used for fast traversal
105     // of resident pages.
106     //
107     // This follows the same semantics as the CPU residency bit vector and
108     // notably each bit still represents a PAGE_SIZE amount of data, but the
109     // physical GPU memory is tracked by an array of GPU chunks below.
110     uvm_page_mask_t resident;
111 
112     // Pages that have been evicted to sysmem
113     uvm_page_mask_t evicted;
114 
115     NvU64 *cpu_chunks_dma_addrs;
116 
    // Array of naturally-aligned chunks. Each chunk has the largest possible
    // size which can fit within the block, so the chunks are not of uniform size.
119     //
120     // The number of chunks in the array is calculated using
121     // block_num_gpu_chunks. The size of each chunk is calculated using
122     // block_gpu_chunk_index.
123     uvm_gpu_chunk_t **chunks;
124 
125     // These page table ranges are not necessarily all used at the same time.
126     // The block might also be too small or not aligned properly to use the
127     // larger ranges, in which case they're never allocated.
128     //
129     // Once a range is allocated we keep it around to avoid constant allocation
130     // overhead when doing PTE splitting and merging.
131     //
132     // Check range.table to see if a given range has been allocated yet.
133     //
134     // page_table_range_big's range covers the big PTEs which fit within the
135     // interior of this block. See the big_ptes field.
136     uvm_page_table_range_t page_table_range_2m;
137     uvm_page_table_range_t page_table_range_big;
138     uvm_page_table_range_t page_table_range_4k;
139 
140     // These flags are ignored unless the {block, gpu} pair supports a 2M page
141     // size. In that case it's the responsibility of the block code to make the
142     // lower page tables active by calling uvm_page_tree_write_pde.
143     //
144     // They can be allocated and activated separately, so we have to track them
145     // separately.
146     //
147     // Activated only means that uvm_page_tree_write_pde has been called at some
148     // point in the past with the appropriate range allocated. It does not imply
149     // that the 2M entry is a PDE (see pte_is_2m).
150     bool activated_big;
151     bool activated_4k;
152 
153     // For {block, gpu} pairs which support the 2M page size, the page table
154     // ranges are uninitialized on allocation. This flag tracks whether the big
155     // PTEs have been initialized.
156     //
157     // We don't need an equivalent flag for the 4k range because we always write
158     // just the 4k PTEs not covered by higher-level PTEs. Big PTEs however can
159     // be allocated and activated late while the 4k PTEs are already active, in
160     // which case we need to initialize the entire big range.
161     bool initialized_big;
162 
163     // Sticky state to split PTEs to 4k and keep them there. Used when a fatal
164     // fault has been detected on this GPU to avoid false dependencies within
165     // the uTLB for fatal and non-fatal faults on the same larger PTE, which
166     // could lead to wrong fault attribution.
167     bool force_4k_ptes;
168 
169     // This table shows the HW PTE states given all permutations of pte_is_2m,
170     // big_ptes, and pte_bits. Note that the first row assumes that the 4k page
171     // tables have been allocated (if not, then no PDEs are allocated either).
172     //
173     // |-------------- SW state --------------|------------------- HW state --------------------|
174     //  pte_is_2m  pte_is_big  pte_bits[READ] | Page size  PDE0(2M only)  Big PTE       4k PTE
175     //  ----------------------------------------------------------------------------------------
176     //  0          0           0              | 4k         Valid PDE      Invalid [1]   Invalid
177     //  0          0           1              | 4k         Valid PDE      Invalid [1]   Valid
178     //  0          1           0              | Big        Valid PDE      Unmapped [2]  x
179     //  0          1           1              | Big        Valid PDE      Valid         x
180     //  1          must be 0   0              | 2M         Invalid        x             x
181     //  1          must be 0   1              | 2M         Valid PTE      x             x
182     //
183     // [1]: The big PTE may be unallocated, in which case its pointer won't be
184     //      valid in the parent PDE. If the big PTE is allocated, it will be
185     //      invalid so the 4k PTEs are active.
186     //
187     // [2]: The unmapped big PTE pattern differs from the invalid pattern, and
188     //      it prevents HW from reading the 4k entries. See the unmapped_pte()
189     //      MMU HAL function.
190 
191     // If pte_is_2m is true, there is a 2M PTE covering this VA block (valid or
192     // invalid). If false then we're in one of the following scenarios:
193     // 1) This {block, gpu} does not support 2M pages.
194     // 2) 2M pages are supported but the page_table_range_2m has not been
195     //    allocated (implying that the other page table ranges have not been
196     //    allocated either).
197     // 3) page_table_range_2m has been allocated, but the big_ptes bitmap should
198     //    be used to determine the mix of big and 4k PTEs.
199     bool pte_is_2m;
200 
201     // When pte_is_2m is false, this block consists of any possible mix of big
202     // and 4k PTEs. This bitmap describes that mix. A set bit indicates that the
203     // corresponding big-page-sized region of the block is covered by a big PTE.
204     // A cleared bit indicates that it is covered by 4k PTEs.
205     //
206     // Neither setting implies that the PTE currently has a valid mapping, it
207     // just indicates which PTE is read by the GPU (see the table above).
208     //
209     // The indices represent the corresponding big PTEs in the block's interior.
210     // For example, a block with alignment and size of one 4k page on either
    // side of a big page will only use bit 0. Use uvm_va_block_big_page_index
    // to look up the big_ptes index of a page.
213     //
214     // The block might not be able to fit any big PTEs, in which case this
215     // bitmap is always zero. Use uvm_va_block_gpu_num_big_pages to find the number of
216     // valid bits in this mask.
217     DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
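
    // Illustrative sketch (not from the driver): given a big-page index for a
    // page, obtained via uvm_va_block_big_page_index(), the standard Linux
    // bitmap helpers can be used on this field. Here 'gpu_state' is a
    // hypothetical pointer to this struct and 'big_index' a hypothetical local:
    //
    //     // True if the big-page-sized region containing the page is covered by a big PTE
    //     bool covered_by_big_pte = test_bit(big_index, gpu_state->big_ptes);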
218 
    // See the comments for uvm_va_block_t::cpu.pte_bits.
220     //
221     // The major difference is that these bits are always accurate since, unlike
222     // the CPU PTEs, the UVM driver is in full control of these mappings.
223     //
224     // Note that the granularity is always PAGE_SIZE, not whatever GPU PTE size
225     // happens to currently map these regions. PAGE_SIZE is the minimum
226     // granularity of operations on the VA blocks. As a future optimization we
227     // could consider sub-PAGE_SIZE operations if PAGE_SIZE > 4K and the CPU
228     // isn't involved, for example false sharing among peer GPUs.
229     uvm_page_mask_t pte_bits[UVM_PTE_BITS_GPU_MAX];
230 
231 } uvm_va_block_gpu_state_t;
232 
233 typedef struct
234 {
235     // Per-page residency bit vector, used for fast traversal of resident
236     // pages.
237     //
238     // A set bit means the CPU has a coherent copy of the physical page
239     // resident in the NUMA node's memory, and that a CPU chunk for the
240     // corresponding page index has been allocated. This does not mean that
241     // the coherent copy is currently mapped anywhere, however. A page may be
242     // resident on multiple processors (but not multiple CPU NUMA nodes) when in
243     // read-duplicate mode.
244     //
245     // A cleared bit means the CPU NUMA node does not have a coherent copy of
246     // that page resident. A CPU chunk for the corresponding page index may or
247     // may not have been allocated. If the chunk is present, it's a cached chunk
248     // which can be reused in the future.
249     //
250     // Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
251     // smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
252     // overhead on the whole.
253     uvm_page_mask_t resident;
254 
255     // Per-page allocation bit vector.
256     //
257     // A set bit means that a CPU chunk has been allocated for the
258     // corresponding page index on this NUMA node.
259     uvm_page_mask_t allocated;
260 
261     // CPU memory chunks represent physically contiguous CPU memory
262     // allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
263     // This member is meant to hold an opaque value indicating the CPU
264     // chunk storage method. For more details on CPU chunk storage,
265     // see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
266     unsigned long chunks;
267 } uvm_va_block_cpu_node_state_t;
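
// An illustrative invariant implied by the comments above (sketch only, not
// asserted here; uvm_page_mask_subset() is assumed to be one of the page-mask
// helpers from uvm_va_block_types.h): every page resident on a NUMA node must
// also have a chunk allocated on that node, i.e. with a hypothetical
// 'node_state' pointer to this struct:
//
//     UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &node_state->allocated));
//
// The reverse does not hold, since cached chunks may remain allocated after
// residency is cleared.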
268 
269 // TODO: Bug 1766180: Worst-case we could have one of these per system page.
270 //       Options:
271 //       1) Rely on the OOM killer to prevent the user from trying to do that
272 //       2) Be much more space-conscious in this struct (difficult)
273 //       3) Cap the per-process range and/or block count, like vm.max_map_count
274 //          does for vmas
275 struct uvm_va_block_struct
276 {
277     // Reference count for this block. References are held by:
278     // - The parent VA range for managed blocks or VA space for HMM blocks
279     // - The reverse map
280     // - The eviction path temporarily when attempting to evict a GPU page under
281     //   this block
282     //
283     // This isn't protected by the lock on the eviction path, so it must be
284     // atomic. nv_kref provides that.
285     nv_kref_t kref;
286 
287     // Lock protecting the block. See the comment at the top of uvm.c.
288     uvm_mutex_t lock;
289 
290     // Parent VA range. Managed blocks have this set. HMM blocks will have
291     // va_range set to NULL and hmm.va_space set instead. Dead blocks that are
292     // waiting for the last ref count to be removed have va_range and
293     // hmm.va_space set to NULL (could be either type of block).
294     //
295     // This field can be read while holding either the block lock or just the VA
296     // space lock in read mode, since it can only change when the VA space lock
297     // is held in write mode.
298     uvm_va_range_t *va_range;
299 
300     // Virtual address [start, end] covered by this block. These fields can be
301     // read while holding either the block lock or just the VA space lock in
302     // read mode, since they can only change when the VA space lock is held in
303     // write mode.
304     NvU64 start;
305     NvU64 end;
306 
307     // Per-processor residency bit vector, used for fast lookup of which
308     // processors are active in this block.
309     //
310     // A set bit means the corresponding processor has a coherent physical copy
311     // of memory somewhere in the block. The per-processor state must then be
312     // inspected to find out which pages. The processor may or may not have a
313     // mapping to that physical memory, however.
314     //
315     // A cleared bit means the corresponding processor does not have a coherent
316     // physical copy of any pages under this block. The processor may still have
317     // cached pages allocated for future use, however. It also may have mappings
318     // to pages resident on other processors.
319     uvm_processor_mask_t resident;
320 
321     // Per-processor mapping bit vector, used for fast lookup of which
322     // processors are active in this block.
323     //
324     // A set bit means the corresponding processor has an active, valid page
325     // table mapping to some VA in this block. The per-processor pte_bits state
326     // must then be inspected to find out the mapping address and permissions.
327     //
328     // A cleared bit means the corresponding processor has no virtual mappings
329     // within this block (all pte_bits entries are 0).
330     uvm_processor_mask_t mapped;
331 
332     // Per-processor evicted bit vector, used for fast lookup of which GPUs
333     // have evicted pages in this block.
334     //
    // A set bit means the corresponding processor was the residency location of
    // some of the pages in the block when they were evicted due to memory capacity
337     // limitations. The per-processor state must then be inspected to find out
338     // which pages.
339     //
340     // A cleared bit means the corresponding processor has no evicted pages
341     // within this block (all evicted entries are 0).
342     uvm_processor_mask_t evicted_gpus;
343 
344     struct
345     {
346         // Per-NUMA node tracking of CPU allocations.
347         // This is a dense array with one entry per possible NUMA node.
348         uvm_va_block_cpu_node_state_t **node_state;
349 
350         // Per-page allocation bit vector.
351         //
352         // A set bit means that a CPU page has been allocated for the
353         // corresponding page index on at least one CPU NUMA node.
354         uvm_page_mask_t allocated;
355 
356         // Per-page residency bit vector. See
        // uvm_va_block_cpu_node_state_t::resident for a detailed description.
358         // This mask is a cumulative mask (logical OR) of all
359         // uvm_va_block_cpu_node_state_t::resident masks. It is meant to be used
360         // only for fast testing of page residency when it matters only if the
361         // page is resident on the CPU.
362         //
363         // Note that this mask cannot be set directly as this will cause
364         // inconsistencies between this mask and the per-NUMA residency masks.
365         // In order to properly maintain consistency between the per-NUMA masks
366         // and this one, uvm_va_block_cpu_[set|clear]_residency_*() helpers
367         // should be used.
368         uvm_page_mask_t resident;
369 
370         // Per-page mapping bit vectors, one per bit we need to track. These are
371         // used for fast traversal of valid mappings in the block. These contain
372         // all non-address bits needed to establish a virtual mapping on this
373         // processor (permissions, cacheability, etc).
374         //
375         // A cleared bit in UVM_PTE_BITS_CPU_READ means the CPU has no valid
376         // virtual mapping to that address (the access will fault). Further,
377         // UVM_PTE_BITS_CPU_WRITE is guaranteed to also be clear.
378         //
379         // A set bit in UVM_PTE_BITS_CPU_READ means the CPU has a valid mapping
380         // at that address with at least read permissions. The physical page for
381         // that mapping is contained in the pages array. If
382         // UVM_PTE_BITS_CPU_WRITE is not set, the mapping is read-only.
383         // Otherwise, the mapping is read-write.
384         //
385         // For managed allocations, this is the maximum permissions a PTE
386         // could have, but not necessarily the actual current permissions of the
387         // CPU PTEs. The UVM driver will never change the PTEs without updating
388         // this state, but the kernel can downgrade our CPU mappings at any time
389         // without notifying the UVM driver (for example in response to user
390         // space calling madvise with MADV_DONTNEED).
391         //
392         // For HMM allocations, this is the minimum permission the CPU has since
393         // Linux can upgrade a read-only PTE to read-write without notifying
394         // the UVM driver. This is why read duplication isn't currently
395         // supported.
396         // TODO: Bug 3660922: Need to handle read duplication at some point.
397         uvm_page_mask_t pte_bits[UVM_PTE_BITS_CPU_MAX];
398 
399         // Whether the CPU has ever mapped a page on this VA block. This is
400         // used to force GMMU PDE1 pre-population on ATS systems. See
401         // pre_populate_gpu_pde1 in uvm_va_block.c for more information.
402         NvU8 ever_mapped        : 1;
403 
404         // We can get "unexpected" faults if multiple CPU threads fault on the
405         // same address simultaneously and race to create the mapping. Since
406         // our CPU fault handler always unmaps to handle the case where the
407         // kernel downgrades our CPU mappings, we can introduce an infinite
408         // stream of CPU faults in multi-threaded workloads.
409         //
410         // In order to handle this scenario, we keep track of the first thread
411         // that faulted on a page with valid permissions and the timestamp.
412         // Then, we keep track of the subsequent faults on that page during a
413         // window of time. If the first thread faults again on the page, that
414         // will indicate that the mapping has been downgraded by the kernel and
        // we need to remap it. Faults from the other threads are simply
        // ignored. The information is also cleared on the following events:
417         // - The tracking window finishes
418         // - The page is unmapped
419         struct
420         {
            // Timestamp when the first fault was detected. This is also used
            // as a flag indicating that the contents of this struct are valid.
423             NvU64             first_fault_stamp;
424 
            // First thread that faulted while having valid permissions. We
            // don't take a reference on the pid, so we shouldn't ever use it
427             // for task-lookup in the kernel. We only use it as a heuristic so
428             // it's OK if the pid gets destroyed or reused.
429             pid_t             first_pid;
430 
431             // Index of the page whose faults are being tracked
432             uvm_page_index_t  page_index;
433         } fault_authorized;
434     } cpu;
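
    // A hedged sketch of the downgrade-detection logic described above (not
    // the actual fault handler; 'page_index' is a hypothetical local, the time
    // window check is omitted, and current->pid is the faulting thread's pid):
    //
    //     bool tracked = va_block->cpu.fault_authorized.first_fault_stamp != 0 &&
    //                    va_block->cpu.fault_authorized.page_index == page_index;
    //     if (tracked && va_block->cpu.fault_authorized.first_pid == current->pid) {
    //         // The first thread faulted again within the window: the kernel
    //         // likely downgraded the mapping, so remap the page.
    //     }
    //     else if (tracked) {
    //         // A different thread raced on the same page: ignore the fault.
    //     }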
435 
436     // Per-GPU residency and mapping state
437     //
438     // TODO: Bug 1766180: Even though these are pointers, making this a static
439     //       array will use up a non-trivial amount of storage for small blocks.
440     //       In most cases we won't have anywhere near this many GPUs active
441     //       anyway. Consider using a dense array of just the GPUs registered in
442     //       this VA space, depending on the perf of accessing that array and on
443     //       how noticeable this memory overhead actually is.
444     uvm_va_block_gpu_state_t *gpus[UVM_ID_MAX_GPUS];
445 
446     // Mask to keep track of the pages that are read-duplicate
447     uvm_page_mask_t read_duplicated_pages;
448 
449     // Mask to keep track of the pages that are not mapped on any non-UVM-Lite
450     // processor. This mask is not used for HMM because the CPU can map pages
451     // at any time without notifying the driver.
452     //     0: Page is definitely not mapped by any processors
453     //     1: Page may or may not be mapped by a processor
454     //
455     // This mask sets the bit when the page is mapped on any non-UVM-Lite
456     // processor but it is not always unset on unmap (to avoid a performance
457     // impact). Therefore, it can contain false negatives. It should be only
458     // used for opportunistic optimizations that have a fast path for pages
459     // that are not mapped anywhere (see uvm_va_block_migrate_locked, for
460     // example), but not the other way around.
461     uvm_page_mask_t maybe_mapped_pages;
462 
463     // Tracks all outstanding GPU work related to this block: GPU copies, PTE
464     // updates, TLB invalidates, etc. The residency and mapping state is only
465     // valid once this tracker is done.
466     //
467     // CPU operations need to wait for this tracker to be done. GPU operations
468     // need to acquire it before pushing their work, then that work must be
469     // added to this tracker before the block's lock is dropped.
470     uvm_tracker_t tracker;
471 
472     // A queue item for establishing eviction mappings in a deferred way
473     nv_kthread_q_item_t eviction_mappings_q_item;
474 
475     uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
476 
    // Prefetch information that is updated while holding the va_block lock but
478     // records state while the lock is not held.
479     struct
480     {
481         uvm_processor_id_t last_migration_proc_id;
482 
483         NvU16 fault_migrations_to_last_proc;
484     } prefetch_info;
485 
486     struct
487     {
488 #if UVM_IS_CONFIG_HMM()
489         // The MMU notifier is registered per va_block.
490         struct mmu_interval_notifier notifier;
491 #endif
492 
493         // This is used to serialize migrations between CPU and GPU while
494         // allowing the va_block lock to be dropped.
495         // This must be acquired before locking the va_block lock if the
496         // critical section can change the residency state.
497         // Do not access directly, use the uvm_hmm_migrate_*() routines.
498         uvm_mutex_t migrate_lock;
499 
500         // Sequence number to tell if any changes were made to the va_block
501         // while not holding the block lock and calling hmm_range_fault().
502         unsigned long changed;
503 
504         // Parent VA space pointer. It is NULL for managed blocks or if
505         // the HMM block is dead. This field can be read while holding the
506         // block lock and is only modified while holding the va_space write
507         // lock and va_block lock (same as the va_range pointer).
508         uvm_va_space_t *va_space;
509 
510         // Tree of uvm_va_policy_node_t. The policy node ranges always cover
511         // all or part of a VMA range or a contiguous range of VMAs within the
512         // va_block. Policy nodes are resized or deleted when the underlying
513         // VMA range is changed by Linux via the invalidate() callback.
514         // Otherwise, policies could be stale after munmap().
515         // Locking: The va_block lock is needed to access or modify the tree.
516         uvm_range_tree_t va_policy_tree;
517 
518         // Storage node for range tree of va_blocks.
519         uvm_range_tree_node_t node;
520     } hmm;
521 };
522 
523 // We define additional per-VA Block fields for testing. When
// uvm_enable_builtin_tests is defined, all VA blocks are allocated with the
// size of uvm_va_block_wrapper_t. Otherwise, the test fields are not available.
526 // Use the uvm_va_block_get_test function defined below to obtain a safe
527 // pointer to uvm_va_block_test_t from a uvm_va_block_t pointer.
528 struct uvm_va_block_wrapper_struct
529 {
530     uvm_va_block_t block;
531 
532     struct uvm_va_block_test_struct
533     {
534         // Count of how many page table allocations should be forced to retry
535         // with eviction enabled. Used for testing only.
536         NvU32 page_table_allocation_retry_force_count;
537 
538         // Count of how many user pages allocations should be forced to retry
539         // with eviction enabled. Used for testing only.
540         NvU32 user_pages_allocation_retry_force_count;
541 
542         // Mask of chunk sizes to be used for CPU chunk allocations.
543         // The actual set of chunk sizes to be used will be the set resulting
544         // from AND'ing this value with the value of
545         // uvm_cpu_chunk_allocation_sizes module parameter.
546         NvU32 cpu_chunk_allocation_size_mask;
547 
548         // Subsequent operations that need to allocate CPU pages will fail. As
549         // opposed to other error injection settings, this one fails N times
550         // and then succeeds instead of failing on the Nth try. A value of ~0u
551         // means fail indefinitely.
552         // This is because this error is supposed to be fatal and tests verify
553         // the state of the VA blocks after the failure. However, some tests
554         // use kernels to trigger migrations and a fault replay could trigger
555         // a successful migration if this error flag is cleared.
556         NvU32 inject_cpu_pages_allocation_error_count;
557 
        // The NUMA node ID from which any CPU chunks will be allocated.
559         // This will override any other setting and/or policy.
560         // Note that the kernel is still free to allocate from any of the
561         // nodes in the thread's policy.
562         int cpu_chunk_allocation_target_id;
563         int cpu_chunk_allocation_actual_id;
564 
565         // Force the next eviction attempt on this block to fail. Used for
566         // testing only.
567         bool inject_eviction_error;
568 
569         // Force the next successful chunk allocation to then fail. Used for testing
570         // only to simulate driver metadata allocation failure.
571         bool inject_populate_error;
572 
573         // Force the next split on this block to fail.
574         // Set by error injection ioctl for testing purposes only.
575         bool inject_split_error;
576     } test;
577 };
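
// A minimal sketch of how the test fields relate to the block pointer
// (illustrative only; the real accessor is the uvm_va_block_get_test function
// defined below, which also accounts for uvm_enable_builtin_tests):
//
//     struct uvm_va_block_wrapper_struct *wrapper =
//         container_of(va_block, struct uvm_va_block_wrapper_struct, block);
//
//     // For example, force the next split of this block to fail in a test
//     wrapper->test.inject_split_error = true;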
578 
579 // Tracking needed for supporting allocation-retry of user GPU memory
580 struct uvm_va_block_retry_struct
581 {
582     // A tracker used for all allocations from PMM.
583     uvm_tracker_t tracker;
584 
585     // List of allocated chunks (uvm_gpu_chunk_t). Currently all chunks are of
    // the same size. However, the list can contain chunks from multiple GPUs. All
587     // remaining free chunks are freed when the operation is finished with
588     // uvm_va_block_retry_deinit().
589     struct list_head free_chunks;
590 
591     // List of chunks allocated and used during the block operation. This list
592     // can contain chunks from multiple GPUs. All the used chunks are unpinned
593     // when the operation is finished with uvm_va_block_retry_deinit().
594     struct list_head used_chunks;
595 };
596 
597 // Module load/exit
598 NV_STATUS uvm_va_block_init(void);
599 void uvm_va_block_exit(void);
600 
601 // Allocates and initializes the block. The block's ref count is initialized to
602 // 1. The caller is responsible for inserting the block into its parent
603 // va_range.
604 //
605 // The caller must be holding the VA space lock in at least read mode.
606 //
607 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
608 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
609                               NvU64 start,
610                               NvU64 end,
611                               uvm_va_block_t **out_block);
612 
613 // Internal function called only when uvm_va_block_release drops the ref count
614 // to 0. Do not call directly.
615 void uvm_va_block_destroy(nv_kref_t *kref);
616 
617 static inline void uvm_va_block_retain(uvm_va_block_t *va_block)
618 {
619     nv_kref_get(&va_block->kref);
620 }
621 
622 // Locking: The va_block lock must not be held.
623 // The va_space lock must be held in write mode unless it is the special case
624 // that the block has no GPU state; for example, right after calling
625 // uvm_va_block_create(). In that case, the va_space lock can be held in read
626 // mode.
627 static inline void uvm_va_block_release(uvm_va_block_t *va_block)
628 {
629     if (va_block) {
630         // The calling thread shouldn't be holding the block's mutex when
631         // releasing the block as it might get destroyed.
632         uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_BLOCK);
633         nv_kref_put(&va_block->kref, uvm_va_block_destroy);
634     }
635 }
636 
637 // Same as uvm_va_block_release but the caller may be holding the VA block lock.
638 // The caller must ensure that the refcount will not get to zero in this call.
639 static inline void uvm_va_block_release_no_destroy(uvm_va_block_t *va_block)
640 {
641     int destroyed = nv_kref_put(&va_block->kref, uvm_va_block_destroy);
642     UVM_ASSERT(!destroyed);
643 }
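
// Typical retain/release pairing, e.g. on the eviction path (illustrative
// sketch only; error handling omitted):
//
//     uvm_va_block_retain(va_block);
//
//     // ... operate on the block; it cannot be destroyed while retained ...
//
//     // Must not hold the block's lock here, since this may destroy the block
//     uvm_va_block_release(va_block);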
644 
645 // Returns true if the block is managed by HMM.
646 // Locking: This can be called while holding either the block lock or just the
647 // VA space lock in read mode, since it can only change when the VA space lock
648 // is held in write mode.
649 static inline bool uvm_va_block_is_hmm(uvm_va_block_t *va_block)
650 {
651 #if UVM_IS_CONFIG_HMM()
652     return va_block->hmm.va_space;
653 #else
654     return false;
655 #endif
656 }
657 
658 // Return true if the block is dead.
659 // Locking: This can be called while holding either the block lock or just the
660 // VA space lock in read mode, since it can only change when the VA space lock
661 // is held in write mode.
662 static inline bool uvm_va_block_is_dead(uvm_va_block_t *va_block)
663 {
664     if (va_block->va_range)
665         return false;
666 
667 #if UVM_IS_CONFIG_HMM()
668     if (va_block->hmm.va_space)
669         return false;
670 #endif
671 
672     return true;
673 }
674 
675 static inline uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id)
676 {
677     return va_block->gpus[uvm_id_gpu_index(gpu_id)];
678 }
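
// For example (sketch only): a NULL return means the given GPU has no state
// allocated in this block, i.e. it has nothing resident and nothing mapped
// here. 'gpu' is a hypothetical uvm_gpu_t pointer:
//
//     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
//     if (!gpu_state)
//         return NV_OK; // nothing to do for this GPU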
679 
680 // Return the va_space pointer of the given block or NULL if the block is dead.
681 // Locking: This can be called while holding either the block lock or just the
682 // VA space lock in read mode, since it can only change when the VA space lock
683 // is held in write mode.
684 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block);
685 
686 // Return the va_space pointer of the given block assuming the block is not dead
687 // (asserts that it is not dead and asserts va_space is not NULL).
688 // Locking: This can be called while holding either the block lock or just the
689 // VA space lock in read mode, since it can only change when the VA space lock
690 // is held in write mode.
691 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block);
692 
693 // Return true if the VA space has access counter migrations enabled and should
694 // remote map pages evicted to system memory. This is OK since access counters
695 // can pull the data back to vidmem if sufficient accesses trigger a migration.
696 // The caller must ensure that the VA space cannot go away.
697 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space);
698 
699 // Dynamic cache-based allocation for uvm_va_block_context_t.
700 //
701 // See uvm_va_block_context_init() for a description of the mm parameter.
702 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm);
703 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
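
// Typical usage sketch (illustrative only; this assumes a NULL return on
// allocation failure, and mm may be NULL as described for
// uvm_va_block_context_init()):
//
//     uvm_va_block_context_t *block_context = uvm_va_block_context_alloc(mm);
//     if (!block_context)
//         return NV_ERR_NO_MEMORY;
//
//     // ... pass block_context to the uvm_va_block_* operations below ...
//
//     uvm_va_block_context_free(block_context);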
704 
705 // Initialization of an already-allocated uvm_va_block_context_t.
706 //
707 // mm is used to initialize the value of va_block_context->mm. NULL is allowed.
708 void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm);
709 
710 // Return the preferred NUMA node ID for the block's policy.
711 // If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
712 // is returned.
713 int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context);
714 
715 // TODO: Bug 1766480: Using only page masks instead of a combination of regions
716 //       and page masks could simplify the below APIs and their implementations
717 //       at the cost of having to scan the whole mask for small regions.
718 //       Investigate the performance effects of doing that.
719 
720 // Moves the physical pages of the given region onto the destination processor.
721 // If page_mask is non-NULL, the movement is further restricted to only those
722 // pages in the region which are present in the mask.
723 //
724 // prefetch_page_mask may be passed as a subset of page_mask when cause is
725 // UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT,
726 // UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT, or
727 // UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER to indicate pages that have been
728 // pulled due to automatic page prefetching heuristics. For pages in this mask,
729 // UVM_MAKE_RESIDENT_CAUSE_PREFETCH will be reported in migration events,
730 // instead.
731 //
732 // This function breaks read duplication for all given pages even if they
733 // don't migrate. Pages which are not resident on the destination processor
734 // will also be unmapped from all existing processors, be populated in the
735 // destination processor's memory, and copied to the new physical location.
736 // Any new memory will be zeroed if it is the first allocation for that page
737 // in the system.
738 //
739 // This function does not create any new virtual mappings.
740 //
741 // This function acquires/waits for the va_block tracker and updates that
742 // tracker with any new work pushed.
743 //
744 // Allocation-retry: this operation may need to perform eviction to be able to
745 // allocate GPU memory successfully and if that happens,
746 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned. That also means that the
747 // block's lock has been unlocked and relocked as part of the call and that the
748 // whole sequence of operations performed under the block's lock needs to be
749 // attempted again. To facilitate that, the caller needs to provide the same
750 // va_block_retry struct for each attempt that has been initialized before the
751 // first attempt and needs to be deinitialized after the last one. Most callers
752 // can just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the
753 // caller.
754 //
755 // If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
756 // user memory is guaranteed not to happen. Allocation-retry of GPU page tables
757 // can still occur though.
758 //
759 // va_block_context must not be NULL and policy for the region must
760 // match. This function will set a bit in
761 // va_block_context->make_resident.pages_changed_residency for each
762 // page that changed residency (due to a migration or first
763 // population) as a result of the operation and
764 // va_block_context->make_resident.all_involved_processors for each
// processor involved in the copy. This function only sets bits in those
// masks; it is the caller's responsibility to decide whether to zero the
// masks beforehand.
768 //
769 // va_block_context->make_resident.dest_nid is used to guide the NUMA node for
770 // CPU allocations.
771 //
772 // Notably any status other than NV_OK indicates that the block's lock might
773 // have been unlocked and relocked.
774 //
775 // LOCKING: The caller must hold the va_block lock.
776 // If va_block_context->mm != NULL, va_block_context->mm->mmap_lock must be
777 // held in at least read mode.
778 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
779                                      uvm_va_block_retry_t *va_block_retry,
780                                      uvm_va_block_context_t *va_block_context,
781                                      uvm_processor_id_t dest_id,
782                                      uvm_va_block_region_t region,
783                                      const uvm_page_mask_t *page_mask,
784                                      const uvm_page_mask_t *prefetch_page_mask,
785                                      uvm_make_resident_cause_t cause);
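
// A hedged sketch of the manual allocation-retry pattern described above.
// This is illustrative only: the exact uvm_va_block_retry_init()/
// uvm_va_block_retry_deinit() argument lists are assumptions here, 'dest_id',
// 'region' and 'cause' are the caller's values, and most callers should use
// the UVM_VA_BLOCK_LOCK_RETRY() helper instead:
//
//     uvm_va_block_retry_t va_block_retry;
//     NV_STATUS status;
//
//     uvm_va_block_retry_init(&va_block_retry);
//     uvm_mutex_lock(&va_block->lock);
//
//     do {
//         status = uvm_va_block_make_resident(va_block,
//                                             &va_block_retry,
//                                             va_block_context,
//                                             dest_id,
//                                             region,
//                                             NULL,      // page_mask: whole region
//                                             NULL,      // prefetch_page_mask
//                                             cause);
//     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);
//
//     uvm_mutex_unlock(&va_block->lock);
//     uvm_va_block_retry_deinit(&va_block_retry, va_block);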
786 
787 // Similar to uvm_va_block_make_resident (read documentation there). The main
788 // differences are:
789 // - Pages are copied not moved (i.e. other copies of the page are not
790 //   unmapped)
791 // - Processors with a resident copy of pages that migrated have write and
792 //   atomic access permission revoked, unlike in uvm_va_block_make_resident
793 //   where they are unmapped
794 // - All remote mappings (due to either SetAccessedBy or performance heuristics)
795 //   are broken
796 // - Only managed va_blocks are supported.
797 //   TODO: Bug 3660922: need to implement HMM read duplication support.
798 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
799                                                     uvm_va_block_retry_t *va_block_retry,
800                                                     uvm_va_block_context_t *va_block_context,
801                                                     uvm_processor_id_t dest_id,
802                                                     uvm_va_block_region_t region,
803                                                     const uvm_page_mask_t *page_mask,
804                                                     const uvm_page_mask_t *prefetch_page_mask,
805                                                     uvm_make_resident_cause_t cause);
806 
807 // Similar to uvm_va_block_make_resident() (read documentation there). The
808 // difference is that source pages are only copied to the destination and the
809 // residency is not updated until uvm_va_block_make_resident_finish() is called.
810 // Otherwise, the combination of uvm_va_block_make_resident_copy() and
811 // uvm_va_block_make_resident_finish() is the same as just calling
// uvm_va_block_make_resident(). Note, however, that for the operation to be
// complete the va_block lock must be held across the two calls. The va_block
// lock may be dropped after calling uvm_va_block_make_resident_copy(), but in
// that case uvm_va_block_make_resident_copy() must be called again after
// relocking the va_block lock and before calling
// uvm_va_block_make_resident_finish().
817 // This split is needed when using migrate_vma_setup() and migrate_vma_pages()
818 // so that when migrate_vma_pages() indicates a page is not migrating, the
819 // va_block state is not updated.
820 // LOCKING: The caller must hold the va_block lock.
821 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
822                                           uvm_va_block_retry_t *va_block_retry,
823                                           uvm_va_block_context_t *va_block_context,
824                                           uvm_processor_id_t dest_id,
825                                           uvm_va_block_region_t region,
826                                           const uvm_page_mask_t *page_mask,
827                                           const uvm_page_mask_t *prefetch_page_mask,
828                                           uvm_make_resident_cause_t cause);
829 
830 // The page_mask must be the same or a subset of the page_mask passed to
831 // uvm_va_block_make_resident_copy(). This step updates the residency and breaks
832 // read duplication.
833 // LOCKING: The caller must hold the va_block lock.
834 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
835                                        uvm_va_block_context_t *va_block_context,
836                                        uvm_va_block_region_t region,
837                                        const uvm_page_mask_t *page_mask);
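
// Illustrative ordering of the split copy/finish operation (sketch only;
// the interaction with migrate_vma_setup()/migrate_vma_pages() and error
// handling are omitted, and the remaining arguments follow
// uvm_va_block_make_resident()):
//
//     status = uvm_va_block_make_resident_copy(va_block, &va_block_retry, va_block_context,
//                                              dest_id, region, page_mask, NULL, cause);
//
//     // ... migrate_vma_pages() decides which pages actually migrate; if the
//     //     va_block lock was dropped, redo the copy step after relocking ...
//
//     uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask);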
838 
839 // Creates or upgrades a mapping from the input processor to the given virtual
840 // address region. Pages which already have new_prot permissions or higher are
841 // skipped, so this call ensures that the range is mapped with at least new_prot
842 // permissions. new_prot must not be UVM_PROT_NONE. uvm_va_block_unmap or
843 // uvm_va_block_revoke_prot should be used to downgrade permissions instead.
844 //
845 // The mapped pages are described by the region parameter and the map page mask
846 // that allows the caller to restrict the map operation to specific pages within
847 // the region. If the page mask is NULL then the whole region is mapped.
848 //
849 // If the input processor is a GPU with no GPU VA space registered, or if the
850 // input processor is the CPU and this thread is not allowed to create CPU
851 // mappings, this function does nothing. CPU mappings are only allowed if
852 // uvm_va_range_vma_check(va_block_context->mm) is valid, so the caller must
853 // set va_block_context->mm before calling this function.
854 //
855 // cause specifies the cause to be reported in events in case a remote mapping
856 // is created.
857 //
858 // Any CPU mappings will wait for the va_block tracker. If this function pushes
859 // GPU work it will first acquire the va_block tracker, then add the pushed work
860 // to out_tracker. It is the caller's responsibility to add this work to
861 // va_block's tracker. Note that while it is generally safe to run map
862 // operations on different GPUs concurrently, two PTE operations (map, unmap,
863 // revoke) on the same GPU must be serialized even if they target different
864 // pages because the earlier operation can cause a PTE split or merge which is
865 // assumed by the later operation.
866 //
867 // va_block_context must not be NULL and policy for the region must match.
868 // See the comments for uvm_va_block_check_policy_is_valid().
869 //
870 // If allocation-retry was required as part of the operation and was successful,
871 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
872 // out_tracker were added to the block's tracker and then the block's lock was
873 // unlocked and relocked.
874 //
875 // In general, any status other than NV_OK indicates that the block's lock might
876 // have been unlocked and relocked.
877 //
878 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
879 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
880 //          mode.
881 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
882                            uvm_va_block_context_t *va_block_context,
883                            uvm_processor_id_t id,
884                            uvm_va_block_region_t region,
885                            const uvm_page_mask_t *map_page_mask,
886                            uvm_prot_t new_prot,
887                            UvmEventMapRemoteCause cause,
888                            uvm_tracker_t *out_tracker);
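
// Sketch of the out_tracker handling described above (illustrative only;
// uvm_tracker_init(), uvm_tracker_add_tracker_safe() and uvm_tracker_deinit()
// are assumed to be the tracker helpers from uvm_tracker.h, and new_prot/cause
// are the caller's values):
//
//     uvm_tracker_t local_tracker;
//     uvm_tracker_init(&local_tracker);
//
//     status = uvm_va_block_map(va_block, va_block_context, id, region,
//                               NULL, new_prot, cause, &local_tracker);
//
//     // On NV_OK the caller must hand the pushed work back to the block's
//     // tracker before dropping the block lock (on
//     // NV_ERR_MORE_PROCESSING_REQUIRED the entries were already added to the
//     // block's tracker, as noted above).
//     if (status == NV_OK)
//         status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
//     uvm_tracker_deinit(&local_tracker);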
889 
890 // Like uvm_va_block_map, except it maps all processors in the input mask. The
891 // VA block tracker contains all map operations on return.
892 //
893 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
894 // uvm_va_block_map() indicating that the operation needs to be retried.
895 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
896                                 uvm_va_block_context_t *va_block_context,
897                                 const uvm_processor_mask_t *map_processor_mask,
898                                 uvm_va_block_region_t region,
899                                 const uvm_page_mask_t *map_page_mask,
900                                 uvm_prot_t new_prot,
901                                 UvmEventMapRemoteCause cause);
902 
903 // Unmaps virtual regions from a single processor. This does not free page
904 // tables or physical memory. This is safe to call on the eviction path, but the
905 // caller must ensure that the block hasn't been killed.
906 //
907 // The unmapped pages are described by the region parameter and the unmap page
908 // mask that allows the caller to restrict the unmap operation to specific pages
909 // within the region. If the page mask is NULL then the whole region is
910 // unmapped.
911 //
912 // If id is UVM_ID_CPU, this is guaranteed to return NV_OK, and this is safe to
913 // call without holding a reference on the mm which owns the associated vma.
914 //
915 // Any CPU unmappings will wait for the va_block tracker. If this function
916 // pushes GPU work it will first acquire the va_block tracker, then add the
917 // pushed work to out_tracker. It is the caller's responsibility to add this
918 // work to va_block's tracker. Note that while it is generally safe to run unmap
919 // operations on different GPUs concurrently, two PTE operations (map, unmap,
920 // revoke) on the same GPU must be serialized even if they target different
921 // pages because the earlier operation can cause a PTE split or merge which is
922 // assumed by the later operation.
923 //
924 // va_block_context must not be NULL.
925 //
926 // If allocation-retry was required as part of the operation and was successful,
927 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
928 // out_tracker were added to the block's tracker and then the block's lock was
929 // unlocked and relocked. It is guaranteed that retry will not be required if
930 // the unmap does not cause a PTE split. Examples of operations which will not
931 // cause a PTE split include unmapping the entire block, unmapping all PTEs with
932 // matching attributes, and unmapping all PTEs which point to the same physical
933 // chunk.
934 //
935 // LOCKING: The caller must hold the va_block lock.
936 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
937                              uvm_va_block_context_t *va_block_context,
938                              uvm_processor_id_t id,
939                              uvm_va_block_region_t region,
940                              const uvm_page_mask_t *unmap_page_mask,
941                              uvm_tracker_t *out_tracker);
942 
943 // Like uvm_va_block_unmap, except it unmaps all processors in the input mask.
// The VA block tracker contains all unmap operations on return.
945 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
946                                   uvm_va_block_context_t *va_block_context,
947                                   const uvm_processor_mask_t *unmap_processor_mask,
948                                   uvm_va_block_region_t region,
949                                   const uvm_page_mask_t *unmap_page_mask);
950 
951 // Function called when the preferred location changes. Notably:
952 // - Mark all CPU pages as dirty because the new processor may not have
953 //   up-to-date data.
954 // - Unmap the preferred location's processor from any pages in this region
955 //   which are not resident on the preferred location.
956 //
957 // va_block_context must not be NULL and policy for the region must match.
958 // See the comments for uvm_va_block_check_policy_is_valid().
959 //
960 // LOCKING: The caller must hold the VA block lock.
961 NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
962                                                      uvm_va_block_context_t *va_block_context,
963                                                      uvm_va_block_region_t region);
964 
965 // Maps the given processor to all resident pages in this block, as allowed by
966 // location and policy. Waits for the operation to complete before returning.
967 // This function should only be called with managed va_blocks.
968 //
969 // va_block_context must not be NULL and policy for the region must match.
970 // See the comments for uvm_va_block_check_policy_is_valid().
971 //
972 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
973 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
974 //          read mode.
975 NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
976                                        uvm_va_block_context_t *va_block_context,
977                                        uvm_processor_id_t processor_id);
978 
979 // Maps given processor to all resident pages in this block and region, as
980 // allowed by location and policy. The caller is responsible for waiting for
981 // the tracker after all mappings have been started.
982 // This function can be called with HMM and managed va_blocks.
983 //
984 // va_block_context must not be NULL and policy for the region must match.
985 // See the comments for uvm_va_block_check_policy_is_valid().
986 //
987 // LOCKING: The caller must hold the va_block lock and
988 //          va_block_context->mm->mmap_lock must be held in at least read mode.
989 NV_STATUS uvm_va_block_set_accessed_by_locked(uvm_va_block_t *va_block,
990                                               uvm_va_block_context_t *va_block_context,
991                                               uvm_processor_id_t processor_id,
992                                               uvm_va_block_region_t region,
993                                               uvm_tracker_t *out_tracker);
994 
995 // Breaks SetAccessedBy and remote mappings
996 // This function should only be called with managed va_blocks.
997 //
998 // va_block_context must not be NULL and policy for the region must match.
999 // See the comments for uvm_va_block_check_policy_is_valid().
1000 //
1001 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
1002 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
1003 //          read mode.
1004 NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
1005                                             uvm_va_block_context_t *va_block_context);
1006 
1007 // Restores SetAccessedBy mappings
1008 // This function should only be called with managed va_blocks.
1009 //
1010 // va_block_context must not be NULL and policy for the region must match.
1011 // See the comments for uvm_va_block_check_policy_is_valid().
1012 //
1013 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
1014 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
1015 //          read mode.
1016 NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
1017                                               uvm_va_block_context_t *va_block_context);
1018 
1019 // Check if processor_id is allowed to access the va_block with access_type
1020 // permissions. Return values:
1021 //
1022 // NV_ERR_INVALID_ADDRESS       The VA block is logically dead (zombie)
1023 // NV_ERR_INVALID_ACCESS_TYPE   The vma corresponding to the VA range does not
1024 //                              allow access_type permissions, or migration is
1025 //                              disallowed and processor_id cannot access the
1026 //                              range remotely (UVM-Lite).
1027 // NV_ERR_INVALID_OPERATION     The access would violate the policies specified
1028 //                              by UvmPreventMigrationRangeGroups.
1029 //
1030 // va_block_context must not be NULL, policy must match, and if the va_block is
1031 // a HMM block, va_block_context->hmm.vma must be valid which also means the
1032 // va_block_context->mm is not NULL, retained, and locked for at least read.
1033 // Locking: the va_block lock must be held.
1034 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
1035                                                  uvm_va_block_context_t *va_block_context,
1036                                                  uvm_processor_id_t processor_id,
1037                                                  uvm_page_index_t page_index,
1038                                                  uvm_fault_type_t access_type,
1039                                                  bool allow_migration);
1040 
1041 // API for access privilege revocation
1042 //
1043 // Revoke prot_to_revoke access permissions for the given processor.
1044 //
1045 // The revoked pages are described by the region parameter and the revoke page
1046 // mask that allows the caller to restrict the revoke operation to specific
1047 // pages within the region.
1048 //
1049 // prot_to_revoke must be greater than UVM_PROT_READ_ONLY. Caller should call
1050 // unmap explicitly if it wants to revoke all access privileges.
1051 //
1052 // If id is UVM_ID_CPU, and prot_to_revoke is UVM_PROT_READ_WRITE_ATOMIC, no
1053 // action is performed. If the processor id corresponds to the CPU and the
1054 // caller cannot establish CPU mappings because it does not have a reference on
1055 // vma->vm_mm (va_block_context->mm != vma->vm_mm), the page will be simply
1056 // unmapped. Caller should call unmap explicitly if it wants to revoke all
1057 // access privileges.
1058 //
1059 // Any CPU revocation will wait for the va_block tracker. If this function
1060 // pushes GPU work it will first acquire the va_block tracker, then add the
1061 // pushed work to out_tracker. It is the caller's responsibility to add this
1062 // work to va_block's tracker. Note that while it is generally safe to run
1063 // revocation operations on different GPUs concurrently, two PTE operations
1064 // (map, unmap, revoke) on the same GPU must be serialized even if they target
1065 // different pages because the earlier operation can cause a PTE split or merge
1066 // which is assumed by the later operation.
1067 //
1068 // va_block_context must not be NULL.
1069 //
1070 // If allocation-retry was required as part of the operation and was successful,
1071 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
1072 // out_tracker were added to the block's tracker and then the block's lock was
1073 // unlocked and relocked.
1074 //
1075 // In general, any status other than NV_OK indicates that the block's lock might
1076 // have been unlocked and relocked.
1077 //
1078 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
1079 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
1080 //          mode.
1081 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
1082                                    uvm_va_block_context_t *va_block_context,
1083                                    uvm_processor_id_t id,
1084                                    uvm_va_block_region_t region,
1085                                    const uvm_page_mask_t *revoke_page_mask,
1086                                    uvm_prot_t prot_to_revoke,
1087                                    uvm_tracker_t *out_tracker);
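
// Illustrative usage sketch for uvm_va_block_revoke_prot() (not part of the
// API). gpu, region, revoke_page_mask and va_block_context are assumed to be
// set up by the caller, and the usual uvm_tracker.h helpers are assumed:
//
//     NV_STATUS status;
//     uvm_tracker_t local_tracker;
//
//     uvm_tracker_init(&local_tracker);
//     status = uvm_va_block_revoke_prot(va_block,
//                                       va_block_context,
//                                       gpu->id,
//                                       region,
//                                       revoke_page_mask,
//                                       UVM_PROT_READ_WRITE,
//                                       &local_tracker);
//
//     // On NV_OK the caller must add the pushed work to the block's tracker.
//     // On NV_ERR_MORE_PROCESSING_REQUIRED the entries have already been
//     // added, as described above.
//     if (status == NV_OK)
//         status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
//
//     uvm_tracker_deinit(&local_tracker);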
1088 
1089 // Like uvm_va_block_revoke_prot(), except it revokes all processors in the
1090 // input mask. The VA block tracker contains all revocation operations on
1091 // return.
1092 //
1093 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
1094 // uvm_va_block_revoke_prot() indicating that the operation needs to be retried.
1095 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
1096                                         uvm_va_block_context_t *va_block_context,
1097                                         const uvm_processor_mask_t *revoke_processor_mask,
1098                                         uvm_va_block_region_t region,
1099                                         const uvm_page_mask_t *revoke_page_mask,
1100                                         uvm_prot_t prot_to_revoke);
1101 
1102 // Tries to map all pages in the given region and map_page_mask with at most
1103 // max_prot privileges for appropriate processors as determined by the
1104 // accessed_by mask, heuristics and the given processor mask (excluding
1105 // processor_id, which triggered the migration and should have already been
1106 // mapped).
1107 //
1108 // va_block_context must not be NULL and policy for the region must match.
1109 // See the comments for uvm_va_block_check_policy_is_valid().
1110 //
1111 // This function acquires/waits for the va_block tracker and updates that
1112 // tracker with any new work pushed.
1113 //
1114 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
1115 // uvm_va_block_map() indicating that the operation needs to be retried.
1116 //
1117 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
1118 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
1119 //          mode.
1120 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
1121                                                     uvm_va_block_context_t *va_block_context,
1122                                                     uvm_processor_id_t new_residency,
1123                                                     uvm_processor_id_t processor_id,
1124                                                     uvm_va_block_region_t region,
1125                                                     const uvm_page_mask_t *map_page_mask,
1126                                                     uvm_prot_t max_prot,
1127                                                     const uvm_processor_mask_t *processor_mask);
1128 
1129 // Maps processors using SetAccessedBy to all resident pages in the region
// parameter. On Volta+ it is also used to map evicted pages that can later be
// pulled back by using access counters.
1132 //
1133 // This function acquires/waits for the va_block tracker and updates that
1134 // tracker with any new work pushed.
1135 //
1136 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
1137 // uvm_va_block_map() indicating that the operation needs to be retried.
1138 //
// va_block_context must not be NULL and policy for the region must match.
1140 // See the comments for uvm_va_block_check_policy_is_valid().
1141 //
1142 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
1143 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
1144 //          mode.
1145 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
1146                                     uvm_va_block_context_t *va_block_context,
1147                                     uvm_processor_id_t processor_id,
1148                                     uvm_va_block_region_t region,
1149                                     const uvm_page_mask_t *page_mask,
1150                                     UvmEventMapRemoteCause cause);
1151 
1152 // Notifies the VA block that a new GPU VA space has been created.
1153 // LOCKING: The caller must hold the va_block lock
1154 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space);
1155 
1156 // Destroys the VA block's mappings and page tables on the GPU, if it has any.
1157 //
1158 // If mm != NULL, that mm is used for any CPU mappings which may be created as
1159 // a result of this call. See uvm_va_block_context_t::mm for details.
1160 //
1161 // va_block_context must not be NULL.
1162 //
1163 // LOCKING: The caller must hold the va_block lock. If block_context->mm is not
1164 // NULL, the caller must hold mm->mmap_lock in at least read mode.
1165 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
1166                                       uvm_gpu_va_space_t *gpu_va_space,
1167                                       uvm_va_block_context_t *block_context);
1168 
1169 // Creates any mappings necessary in this VA block between the two GPUs, in
1170 // either direction.
1171 // LOCKING: The caller must hold the va_block lock
1172 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1173 
1174 // Unmaps all page tables in this VA block which have peer mappings between
1175 // the two GPUs, in either direction.
1176 // LOCKING: The caller must hold the va_block lock
1177 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1178 
1179 // Unmap any mappings from GPU to the preferred location.
1180 //
1181 // The GPU has to be in UVM-Lite mode.
1182 //
1183 // LOCKING: The caller must hold the va_block lock
1184 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
1185 
1186 // Frees all memory under this block associated with this GPU. Any portion of
1187 // the block which is resident on the GPU is evicted to sysmem before being
1188 // freed.
1189 //
1190 // If mm != NULL, that mm is used for any CPU mappings which may be created as
1191 // a result of this call. See uvm_va_block_context_t::mm for details.
1192 //
1193 // LOCKING: This takes and releases the VA block lock. If mm != NULL, the caller
1194 //          must hold mm->mmap_lock in at least read mode.
1195 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
1196 
1197 // Same as uvm_va_block_unregister_gpu() but the VA block lock must be held.
1198 // Note that this handles allocation-retry internally and hence might unlock
1199 // and relock block's lock.
1200 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
1201 
1202 // Unmaps all memory associated with the block and drops the ref count of the
1203 // block. This allows the caller to free resources associated with this block
1204 // regardless of the block's current ref count. Most importantly it allows the
1205 // VA covered by this block to be immediately available for other page table
1206 // mappings upon return.
1207 //
1208 // This clears block->va_range, so only the VA range destroy path should call
1209 // it. Other paths with references on this block, specifically the eviction path
1210 // which temporarily takes a reference to the block, must always check the block
1211 // state after taking the block lock to see if their mapping is still in place.
1212 //
1213 // All of the unmap and state destruction steps are also performed when the ref
1214 // count goes to 0, so this function only needs to be called if the block's
1215 // resources need to be reclaimed immediately.
1216 //
1217 // The caller should not lock the block before calling this function.
1218 //
1219 // This performs a uvm_va_block_release.
1220 void uvm_va_block_kill(uvm_va_block_t *va_block);
1221 
1222 // Exactly the same split semantics as uvm_va_range_split, including error
1223 // handling. See that function's comments for details.
1224 //
1225 // new_va_block's va_range is set to new_va_range before any reverse mapping is
1226 // established to the new block, but the caller is responsible for inserting the
1227 // new block into the range.
1228 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
1229                              NvU64 new_end,
1230                              uvm_va_block_t **new_va_block,
1231                              uvm_va_range_t *new_va_range);
1232 
// Exactly the same split semantics as uvm_va_block_split, including error
// handling, except that the existing_va_block lock needs to be held and
// the new_va_block has to be preallocated.
1236 // Also note that the existing_va_block lock may be dropped and re-acquired.
1237 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
1238                                     NvU64 new_end,
1239                                     uvm_va_block_t *new_va_block,
1240                                     uvm_va_range_t *new_va_range);
1241 
1242 // Handles a CPU fault in the given VA block, performing any operations
1243 // necessary to establish a coherent CPU mapping (migrations, cache invalidates,
1244 // etc.).
1245 //
1246 // Locking:
//  - vma->vm_mm->mmap_lock must be held in at least read mode. Note that this
//    might not be the same as current->mm->mmap_lock.
1249 //  - va_space lock must be held in at least read mode
1250 //
1251 // service_context->block_context.mm is ignored and vma->vm_mm is used instead.
1252 //
1253 // Returns NV_ERR_INVALID_ACCESS_TYPE if a CPU mapping to fault_addr cannot be
1254 // accessed, for example because it's within a range group which is non-
1255 // migratable.
1256 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
1257                                  NvU64 fault_addr,
1258                                  bool is_write,
1259                                  uvm_service_block_context_t *service_context);
1260 
1261 // Performs any operations necessary to establish a coherent mapping
1262 // (migrations, cache invalidates, etc.) in response to the given service block
1263 // context.
1264 //
1265 // service_context must not be NULL and policy for service_context->region must
1266 // match. See the comments for uvm_va_block_check_policy_is_valid().  If
1267 // va_block is a HMM block, va_block_context->hmm.vma must be valid.  See the
1268 // comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1269 // service_context->prefetch_hint is set by this function.
1270 //
1271 // Locking:
1272 //  - service_context->block_context.mm->mmap_lock must be held in at least
1273 //    read mode, if valid.
1274 //  - va_space lock must be held in at least read mode
1275 //  - va_block lock must be held
1276 //
1277 // If allocation-retry was required as part of the operation and was successful,
1278 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
1279 // was unlocked and relocked.
1280 //
1281 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
1282 // and the performance heuristics logic decided to throttle execution.
// Any other error code indicates OOM or a global fatal error.
1285 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
1286                                       uvm_va_block_t *va_block,
1287                                       uvm_va_block_retry_t *block_retry,
1288                                       uvm_service_block_context_t *service_context);
1289 
1290 // Performs population of the destination pages, unmapping and copying source
1291 // pages to new_residency.
1292 //
1293 // service_context must not be NULL and policy for service_context->region must
1294 // match.  See the comments for uvm_va_block_check_policy_is_valid().  If
1295 // va_block is a HMM block, va_block_context->hmm.vma must be valid.  See the
1296 // comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1297 // service_context->prefetch_hint should be set before calling this function.
1298 //
1299 // Locking:
1300 //  - service_context->block_context.mm->mmap_lock must be held in at least
1301 //    read mode, if valid.
1302 //  - va_space lock must be held in at least read mode
1303 //  - va_block lock must be held
1304 //
1305 // If allocation-retry was required as part of the operation and was successful,
1306 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
1307 // was unlocked and relocked.
1308 //
1309 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
1310 // and the performance heuristics logic decided to throttle execution.
// Any other error code indicates OOM or a global fatal error.
1313 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
1314                                     uvm_processor_id_t new_residency,
1315                                     uvm_va_block_t *va_block,
1316                                     uvm_va_block_retry_t *block_retry,
1317                                     uvm_service_block_context_t *service_context);
1318 
1319 // This updates the va_block residency state and maps the faulting processor_id
1320 // to the new residency (which may be remote).
1321 //
1322 // service_context must not be NULL and policy for service_context->region must
1323 // match. See the comments for uvm_va_block_check_policy_is_valid().  If
1324 // va_block is a HMM block, va_block_context->hmm.vma must be valid.  See the
1325 // comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1326 // service_context must be initialized by calling uvm_va_block_service_copy()
1327 // before calling this function.
1328 //
1329 // Locking:
1330 //  - service_context->block_context.mm->mmap_lock must be held in at least
1331 //    read mode, if valid.
1332 //  - va_space lock must be held in at least read mode
1333 //  - va_block lock must be held
1334 //  - the mmap lock and va_space lock must be held across the calls to
1335 //    uvm_va_block_service_copy() and this function. If the va_block lock is
//    dropped in between, special care is needed to check for eviction and
1337 //    invalidation callbacks.
1338 //
1339 // If allocation-retry was required as part of the operation and was successful,
1340 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
1341 // was unlocked and relocked.
1342 //
1343 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
1344 // and the performance heuristics logic decided to throttle execution.
// Any other error code indicates OOM or a global fatal error.
1347 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
1348                                       uvm_va_block_t *va_block,
1349                                       uvm_service_block_context_t *service_context);
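
// Illustrative sketch of the two-phase servicing sequence (allocation-retry
// and error handling are omitted; the mmap_lock and va_space lock must be
// held across both calls as described above, and all variables are assumed
// to be set up by the caller):
//
//     status = uvm_va_block_service_copy(processor_id,
//                                        new_residency,
//                                        va_block,
//                                        block_retry,
//                                        service_context);
//     if (status == NV_OK)
//         status = uvm_va_block_service_finish(processor_id, va_block, service_context);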
1350 
1351 // Allocate GPU state for the given va_block and registered GPUs.
1352 // Locking: The block lock must be held.
1353 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block);
1354 
1355 // Release any GPU or policy data associated with the given region in response
1356 // to munmap().
1357 // Locking: The va_block lock must be held.
1358 void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
1359                                 uvm_va_block_region_t region);
1360 
1361 // Size of the block in bytes. Guaranteed to be a page-aligned value between
1362 // PAGE_SIZE and UVM_VA_BLOCK_SIZE.
1363 static inline NvU64 uvm_va_block_size(uvm_va_block_t *block)
1364 {
1365     NvU64 size = block->end - block->start + 1;
1366     UVM_ASSERT(PAGE_ALIGNED(size));
1367     UVM_ASSERT(size >= PAGE_SIZE);
1368     UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
1369     return size;
1370 }
1371 
1372 // Number of pages with PAGE_SIZE in the block
1373 static inline size_t uvm_va_block_num_cpu_pages(uvm_va_block_t *block)
1374 {
1375     return uvm_va_block_size(block) / PAGE_SIZE;
1376 }
1377 
1378 // VA of the given page using CPU page size. page_index must be valid
1379 static inline NvU64 uvm_va_block_cpu_page_address(uvm_va_block_t *block, uvm_page_index_t page_index)
1380 {
1381     UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(block));
1382     return block->start + PAGE_SIZE * page_index;
1383 }
1384 
1385 // Get the physical address on the given GPU for given residency
1386 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
1387                                                           uvm_page_index_t page_index,
1388                                                           uvm_processor_id_t residency,
1389                                                           uvm_gpu_t *gpu);
1390 
1391 // Get the page physical address on the given GPU
1392 //
1393 // This will assert that GPU state is indeed present.
1394 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
1395                                                           uvm_page_index_t page_index,
1396                                                           uvm_gpu_t *gpu);
1397 
1398 static bool uvm_va_block_contains_address(uvm_va_block_t *block, NvU64 address)
1399 {
1400     return address >= block->start && address <= block->end;
1401 }
1402 
1403 // Obtain a pointer to the uvm_va_block_test_t structure for the given VA
1404 // block. If uvm_enable_builtin_tests is unset, NULL will be returned.
1405 static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block)
1406 {
1407     if (uvm_enable_builtin_tests)
1408         return &container_of(va_block, uvm_va_block_wrapper_t, block)->test;
1409 
1410     return NULL;
1411 }
1412 
1413 // Get the page residency mask for a processor if it's known to be there.
1414 //
1415 // If the processor is the CPU, the residency mask for the NUMA node ID
1416 // specified by nid will be returned (see
1417 // uvm_va_block_cpu_node_state_t::resident). If nid is NUMA_NO_NODE,
1418 // the cumulative CPU residency mask will be returned (see
1419 // uvm_va_block_t::cpu::resident).
1420 //
1421 // If the processor is a GPU, this will assert that GPU state is indeed present.
1422 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid);
1423 
1424 // Get the page mapped mask for a processor. The returned mask cannot be
1425 // directly modified by the caller
1426 //
1427 // If the processor is a GPU, this will assert that GPU state is indeed present.
1428 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
1429 
1430 // Return a mask of non-UVM-Lite pages that are unmapped within the given
1431 // region.
1432 // Locking: The block lock must be held.
1433 void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
1434                                      uvm_va_block_region_t region,
1435                                      uvm_page_mask_t *out_mask);
1436 
1437 // VA block lookup functions. There are a number of permutations which might be
1438 // useful, such as looking up the block from {va_space, va_range} x {addr,
1439 // block index}. The ones implemented here and in uvm_va_range.h support the
1440 // primary three use cases, which are:
1441 // 1) Iterating over all VA blocks in a VA range. This uses block indices on the
1442 //    VA range:
1443 //      uvm_va_range_num_blocks
1444 //      uvm_va_range_block_index
1445 //      uvm_va_range_block
1446 //      uvm_va_range_block_create
1447 // 2) Operating on a single VA block (fault). This looks up the block using the
1448 //    VA space and address:
1449 //      uvm_va_block_find
1450 //      uvm_va_block_find_create
1451 // 3) Operating on a single VA block (fault). This looks up the block using the
1452 //    supplied VA range and address:
1453 //      uvm_va_block_find_create_in_range
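//
// As an illustrative sketch of use case 1 (assuming the uvm_va_range.h
// helpers listed above keep their usual signatures, and that a NULL return
// from uvm_va_range_block() means no block has been created for that index
// yet):
//
//     size_t i;
//
//     for (i = 0; i < uvm_va_range_num_blocks(va_range); i++) {
//         uvm_va_block_t *block = uvm_va_range_block(va_range, i);
//
//         // Use uvm_va_range_block_create() instead if the block should be
//         // created on demand.
//         if (!block)
//             continue;
//
//         // Operate on block
//     }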
1454 
1455 // Finds the UVM or HMM VA block containing addr, if any. The va_space->lock
1456 // must be held in at least read mode. Return values:
1457 // NV_ERR_INVALID_ADDRESS   addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
1458 //                          a HMM enabled VMA.
1459 //
1460 // NV_ERR_OBJECT_NOT_FOUND  addr is valid but no block has been allocated to
1461 //                          cover it yet
1462 //
1463 // NV_OK                    The block was returned successfully
1464 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block);
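
// Illustrative sketch of handling the uvm_va_block_find() return values
// (other errors are simply propagated):
//
//     NV_STATUS status;
//     uvm_va_block_t *block;
//
//     status = uvm_va_block_find(va_space, addr, &block);
//     if (status == NV_ERR_OBJECT_NOT_FOUND) {
//         // addr is valid but no block covers it yet, so there is nothing to do
//         return NV_OK;
//     }
//     else if (status != NV_OK) {
//         return status;
//     }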
1465 
1466 // Same as uvm_va_block_find except that the block is created if not found.
// If addr is covered by a UVM_VA_RANGE_TYPE_MANAGED va_range, a managed block
// will be created. If addr is not covered by any va_range and HMM is enabled
// in the va_space, a HMM block will be created and hmm_vma is set to the VMA
// covering 'addr'; in that case the va_space_mm must be retained and locked.
// Otherwise hmm_vma is set to NULL.
1472 // Return values:
1473 // NV_ERR_INVALID_ADDRESS   addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
1474 //                          a HMM enabled VMA.
1475 // NV_ERR_NO_MEMORY         memory could not be allocated.
1476 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
1477                                    NvU64 addr,
1478                                    struct vm_area_struct **hmm_vma,
1479                                    uvm_va_block_t **out_block);
1480 
1481 // Same as uvm_va_block_find_create except that only managed va_blocks are
1482 // created if not already present in the VA range. Does not require va_space_mm
1483 // to be locked or retained.
1484 NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
1485                                            NvU64 addr,
1486                                            uvm_va_block_t **out_block);
1487 
1488 // Same as uvm_va_block_find_create_managed except that va_range lookup was
1489 // already done by the caller. The supplied va_range must not be NULL.
1490 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
1491                                             uvm_va_range_t *va_range,
1492                                             NvU64 addr,
1493                                             uvm_va_block_t **out_block);
1494 
1495 // Look up a chunk backing a specific address within the VA block.
1496 // Returns NULL if none.
1497 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address);
1498 
1499 // Implementation of the UvmMigrate() API at the VA block scope.
1500 //
1501 // The out_tracker can be NULL.
1502 //
1503 // If do_mappings is false, mappings are not added after pages have been
1504 // migrated.
1505 //
1506 // The caller needs to handle allocation-retry. va_block_retry can be NULL if
1507 // the destination is the CPU.
1508 //
1509 // va_block_context must not be NULL and policy for the region must match. See
1510 // the comments for uvm_va_block_check_policy_is_valid().  If va_block is a HMM
1511 // block, va_block_context->hmm.vma must be valid.  See the comments for
1512 // uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1513 //
1514 // LOCKING: The caller must hold the va_block lock. If va_block_context->mm !=
1515 //          NULL, va_block_context->mm->mmap_lock must be held in at least
1516 //          read mode.
1517 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
1518                                       uvm_va_block_retry_t *va_block_retry,
1519                                       uvm_va_block_context_t *va_block_context,
1520                                       uvm_va_block_region_t region,
1521                                       uvm_processor_id_t dest_id,
1522                                       uvm_migrate_mode_t mode,
1523                                       uvm_tracker_t *out_tracker);
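
// Illustrative sketch for a GPU destination, using the
// UVM_VA_BLOCK_LOCK_RETRY() helper defined later in this file to handle
// allocation-retry. The migrate mode enumerant name is an assumption, the
// out_tracker is NULL (allowed as noted above), and the mmap_lock
// requirements described above still apply:
//
//     uvm_va_block_retry_t va_block_retry;
//
//     status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
//                  uvm_va_block_migrate_locked(va_block,
//                                              &va_block_retry,
//                                              va_block_context,
//                                              region,
//                                              dest_id,
//                                              UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
//                                              NULL));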
1524 
1525 // Write block's data from a CPU buffer
1526 //
1527 // The [dst, dst + size) range has to fit within a single PAGE_SIZE page.
1528 //
1529 // va_block_context must not be NULL. The caller is not required to set
1530 // va_block_context->hmm.vma.
1531 //
1532 // The caller needs to support allocation-retry of page tables.
1533 //
1534 // LOCKING: The caller must hold the va_block lock
1535 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
1536                                       uvm_va_block_context_t *block_context,
1537                                       NvU64 dst,
1538                                       uvm_mem_t *src,
1539                                       size_t size);
1540 
1541 // Read block's data into a CPU buffer
1542 //
1543 // The [src, src + size) range has to fit within a single PAGE_SIZE page.
1544 //
1545 // LOCKING: The caller must hold the va_block lock
1546 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst, NvU64 src, size_t size);
1547 
1548 // Initialize va block retry tracking
1549 void uvm_va_block_retry_init(uvm_va_block_retry_t *uvm_va_block_retry);
1550 
1551 // Deinitialize va block retry tracking after a block operation
1552 //
1553 // Frees all the remaining free chunks and unpins all the used chunks.
1554 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *uvm_va_block_retry, uvm_va_block_t *va_block);
1555 
1556 // Evict all chunks from the block that are subchunks of the passed in root_chunk.
1557 //
1558 // Add all the work tracking the eviction to the tracker.
1559 //
1560 // Returns NV_OK if the block is dead or doesn't have any subchunks of the
1561 // root_chunk.
1562 //
1563 // LOCKING: The caller must hold the va_block lock
1564 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
1565                                     uvm_gpu_t *gpu,
1566                                     uvm_gpu_chunk_t *root_chunk,
1567                                     uvm_tracker_t *tracker);
1568 
1569 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp);
1570 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp);
1571 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp);
1572 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp);
1573 
1574 // Compute the offset in system pages of addr from the start of va_block.
1575 static uvm_page_index_t uvm_va_block_cpu_page_index(uvm_va_block_t *va_block, NvU64 addr)
1576 {
1577     UVM_ASSERT(addr >= va_block->start);
1578     UVM_ASSERT(addr <= va_block->end);
1579     return (addr - va_block->start) / PAGE_SIZE;
1580 }
1581 
1582 // Computes the size and index in the gpu_state chunks array of the GPU chunk
1583 // which corresponds to the given page_index of the VA region.
1584 // Note this is only used for testing and does not work on HMM va_blocks as it
1585 // returns incorrect results for those.
1586 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
1587                                           NvU64 size,
1588                                           uvm_gpu_t *gpu,
1589                                           uvm_page_index_t page_index,
1590                                           uvm_chunk_size_t *out_chunk_size);
1591 
1592 // If there are any resident CPU pages in the block, mark them as dirty
1593 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block);
1594 
1595 // Sets the internal state required to handle fault cancellation
1596 //
1597 // This function may require allocating page tables to split big pages into 4K
1598 // pages. If allocation-retry was required as part of the operation and was
1599 // successful, NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case the
1600 // block's lock was unlocked and relocked.
1601 //
1602 // va_block_context must not be NULL.
1603 //
1604 // LOCKING: The caller must hold the va_block lock.
1605 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu);
1606 
1607 //
1608 // uvm_va_block_region_t helpers
1609 //
1610 
1611 static uvm_va_block_region_t uvm_va_block_region(uvm_page_index_t first, uvm_page_index_t outer)
1612 {
1613     BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= (1 << (sizeof(first) * 8)));
1614 
1615     UVM_ASSERT(first <= outer);
1616 
1617     return (uvm_va_block_region_t){ .first = first, .outer = outer };
1618 }
1619 
1620 static uvm_va_block_region_t uvm_va_block_region_for_page(uvm_page_index_t page_index)
1621 {
1622     return uvm_va_block_region(page_index, page_index + 1);
1623 }
1624 
1625 static size_t uvm_va_block_region_num_pages(uvm_va_block_region_t region)
1626 {
1627     return region.outer - region.first;
1628 }
1629 
1630 static NvU64 uvm_va_block_region_size(uvm_va_block_region_t region)
1631 {
1632     return uvm_va_block_region_num_pages(region) * PAGE_SIZE;
1633 }
1634 
1635 static NvU64 uvm_va_block_region_start(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1636 {
1637     return va_block->start + region.first * PAGE_SIZE;
1638 }
1639 
1640 static NvU64 uvm_va_block_region_end(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1641 {
1642     return va_block->start + region.outer * PAGE_SIZE - 1;
1643 }
1644 
1645 static bool uvm_va_block_region_contains_region(uvm_va_block_region_t region, uvm_va_block_region_t subregion)
1646 {
1647     return subregion.first >= region.first && subregion.outer <= region.outer;
1648 }
1649 
1650 static bool uvm_va_block_region_contains_page(uvm_va_block_region_t region, uvm_page_index_t page_index)
1651 {
1652     return uvm_va_block_region_contains_region(region, uvm_va_block_region_for_page(page_index));
1653 }
1654 
1655 // Create a block range from a va block and start and end virtual addresses
1656 // within the block.
1657 static uvm_va_block_region_t uvm_va_block_region_from_start_end(uvm_va_block_t *va_block, NvU64 start, NvU64 end)
1658 {
1659     uvm_va_block_region_t region;
1660 
1661     UVM_ASSERT(start < end);
1662     UVM_ASSERT(start >= va_block->start);
1663     UVM_ASSERT(end <= va_block->end);
1664     UVM_ASSERT(PAGE_ALIGNED(start));
1665     UVM_ASSERT(PAGE_ALIGNED(end + 1));
1666 
1667     region.first = uvm_va_block_cpu_page_index(va_block, start);
1668     region.outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
1669 
1670     return region;
1671 }
1672 
1673 static uvm_va_block_region_t uvm_va_block_region_from_start_size(uvm_va_block_t *va_block, NvU64 start, NvU64 size)
1674 {
1675     return uvm_va_block_region_from_start_end(va_block, start, start + size - 1);
1676 }
1677 
1678 static uvm_va_block_region_t uvm_va_block_region_from_block(uvm_va_block_t *va_block)
1679 {
1680     return uvm_va_block_region(0, uvm_va_block_num_cpu_pages(va_block));
1681 }
1682 
1683 // Create a block region from a va block and page mask. If va_block is NULL, the
1684 // region is assumed to cover the maximum va_block size. Note that the region
1685 // covers the first through the last set bit and may have unset bits in between.
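// For example, if only page indices 3 and 10 are set in the mask, the
// returned region is [3, 11), which includes the unset pages in between.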
1686 static uvm_va_block_region_t uvm_va_block_region_from_mask(uvm_va_block_t *va_block, const uvm_page_mask_t *page_mask)
1687 {
1688     uvm_va_block_region_t region;
1689     uvm_page_index_t outer;
1690 
1691     if (va_block)
1692         outer = uvm_va_block_num_cpu_pages(va_block);
1693     else
1694         outer = PAGES_PER_UVM_VA_BLOCK;
1695 
1696     region.first = find_first_bit(page_mask->bitmap, outer);
1697     if (region.first >= outer) {
1698         region = uvm_va_block_region(0, 0);
1699     }
1700     else {
1701         // At least one bit is set so find_last_bit() should not return 'outer'.
1702         region.outer = find_last_bit(page_mask->bitmap, outer) + 1;
1703         UVM_ASSERT(region.outer <= outer);
1704     }
1705 
1706     return region;
1707 }
1708 
1709 static bool uvm_page_mask_test(const uvm_page_mask_t *mask, uvm_page_index_t page_index)
1710 {
1711     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1712 
1713     return test_bit(page_index, mask->bitmap);
1714 }
1715 
1716 static bool uvm_page_mask_test_and_set(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1717 {
1718     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1719 
1720     return __test_and_set_bit(page_index, mask->bitmap);
1721 }
1722 
1723 static bool uvm_page_mask_test_and_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1724 {
1725     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1726 
1727     return __test_and_clear_bit(page_index, mask->bitmap);
1728 }
1729 
1730 static void uvm_page_mask_set(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1731 {
1732     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1733 
1734     __set_bit(page_index, mask->bitmap);
1735 }
1736 
1737 static void uvm_page_mask_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1738 {
1739     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1740 
1741     __clear_bit(page_index, mask->bitmap);
1742 }
1743 
1744 static bool uvm_page_mask_region_test(const uvm_page_mask_t *mask,
1745                                       uvm_va_block_region_t region,
1746                                       uvm_page_index_t page_index)
1747 {
1748     if (!uvm_va_block_region_contains_page(region, page_index))
1749         return false;
1750 
1751     return !mask || uvm_page_mask_test(mask, page_index);
1752 }
1753 
1754 static NvU32 uvm_page_mask_region_weight(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
1755 {
1756     NvU32 weight_before = 0;
1757 
1758     if (region.first > 0)
1759         weight_before = bitmap_weight(mask->bitmap, region.first);
1760 
1761     return bitmap_weight(mask->bitmap, region.outer) - weight_before;
1762 }
1763 
1764 static bool uvm_page_mask_region_empty(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
1765 {
1766     return find_next_bit(mask->bitmap, region.outer, region.first) == region.outer;
1767 }
1768 
1769 static bool uvm_page_mask_region_full(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
1770 {
1771     return find_next_zero_bit(mask->bitmap, region.outer, region.first) == region.outer;
1772 }
1773 
1774 static void uvm_page_mask_region_fill(uvm_page_mask_t *mask, uvm_va_block_region_t region)
1775 {
1776     bitmap_set(mask->bitmap, region.first, region.outer - region.first);
1777 }
1778 
1779 static void uvm_page_mask_region_clear(uvm_page_mask_t *mask, uvm_va_block_region_t region)
1780 {
1781     bitmap_clear(mask->bitmap, region.first, region.outer - region.first);
1782 }
1783 
1784 static void uvm_page_mask_region_clear_outside(uvm_page_mask_t *mask, uvm_va_block_region_t region)
1785 {
1786     if (region.first > 0)
1787         bitmap_clear(mask->bitmap, 0, region.first);
1788     if (region.outer < PAGES_PER_UVM_VA_BLOCK)
1789         bitmap_clear(mask->bitmap, region.outer, PAGES_PER_UVM_VA_BLOCK - region.outer);
1790 }
1791 
1792 static void uvm_page_mask_zero(uvm_page_mask_t *mask)
1793 {
1794     bitmap_zero(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1795 }
1796 
1797 static bool uvm_page_mask_empty(const uvm_page_mask_t *mask)
1798 {
1799     return bitmap_empty(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1800 }
1801 
1802 static bool uvm_page_mask_full(const uvm_page_mask_t *mask)
1803 {
1804     return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1805 }
1806 
1807 static void uvm_page_mask_fill(uvm_page_mask_t *mask)
1808 {
1809     bitmap_fill(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1810 }
1811 
1812 static bool uvm_page_mask_and(uvm_page_mask_t *mask_out,
1813                               const uvm_page_mask_t *mask_in1,
1814                               const uvm_page_mask_t *mask_in2)
1815 {
1816     return bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1817 }
1818 
1819 static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out,
1820                                  const uvm_page_mask_t *mask_in1,
1821                                  const uvm_page_mask_t *mask_in2)
1822 {
1823     return bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1824 }
1825 
1826 static void uvm_page_mask_or(uvm_page_mask_t *mask_out,
1827                              const uvm_page_mask_t *mask_in1,
1828                              const uvm_page_mask_t *mask_in2)
1829 {
1830     bitmap_or(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1831 }
1832 
1833 static void uvm_page_mask_complement(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in)
1834 {
1835     bitmap_complement(mask_out->bitmap, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK);
1836 }
1837 
1838 static void uvm_page_mask_copy(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in)
1839 {
1840     bitmap_copy(mask_out->bitmap, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK);
1841 }
1842 
1843 static NvU32 uvm_page_mask_weight(const uvm_page_mask_t *mask)
1844 {
1845     return bitmap_weight(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1846 }
1847 
1848 static bool uvm_page_mask_subset(const uvm_page_mask_t *subset, const uvm_page_mask_t *mask)
1849 {
1850     return bitmap_subset(subset->bitmap, mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1851 }
1852 
1853 static bool uvm_page_mask_equal(const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
1854 {
1855     return bitmap_equal(mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1856 }
1857 
1858 static bool uvm_page_mask_init_from_region(uvm_page_mask_t *mask_out,
1859                                            uvm_va_block_region_t region,
1860                                            const uvm_page_mask_t *mask_in)
1861 {
1862     uvm_page_mask_zero(mask_out);
1863     uvm_page_mask_region_fill(mask_out, region);
1864 
1865     if (mask_in)
1866         return uvm_page_mask_and(mask_out, mask_out, mask_in);
1867 
1868     return true;
1869 }
1870 
1871 static void uvm_page_mask_shift_right(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift)
1872 {
1873     bitmap_shift_right(mask_out->bitmap, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK);
1874 }
1875 
1876 static void uvm_page_mask_shift_left(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift)
1877 {
1878     bitmap_shift_left(mask_out->bitmap, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK);
1879 }
1880 
1881 static bool uvm_page_mask_intersects(const uvm_page_mask_t *mask1, const uvm_page_mask_t *mask2)
1882 {
1883     return bitmap_intersects(mask1->bitmap, mask2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1884 }
1885 
1886 // Print the given page mask on the given buffer using hex symbols. The
1887 // minimum required size of the buffer is UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE.
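// For example (UVM_DBG_PRINT is just an example sink here):
//
//     char buffer[UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE];
//
//     uvm_page_mask_print(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE), buffer);
//     UVM_DBG_PRINT("CPU resident pages: %s\n", buffer);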
1888 static void uvm_page_mask_print(const uvm_page_mask_t *mask, char *buffer)
1889 {
1890     // There are two cases, which depend on PAGE_SIZE
1891     if (PAGES_PER_UVM_VA_BLOCK > 32) {
1892         NvLength current_long_idx = UVM_PAGE_MASK_WORDS - 1;
1893         const char *buffer_end = buffer + UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE;
1894 
1895         UVM_ASSERT(sizeof(*mask->bitmap) == 8);
1896 
1897         // For 4KB pages, we need to iterate over multiple words
1898         do {
1899             NvU64 current_long = mask->bitmap[current_long_idx];
1900 
1901             buffer += sprintf(buffer, "%016llx", current_long);
1902             if (current_long_idx != 0)
1903                 buffer += sprintf(buffer, ":");
1904         } while (current_long_idx-- != 0);
1905 
1906         UVM_ASSERT(buffer <= buffer_end);
1907     }
1908     else {
1909         NvU32 value = (unsigned)*mask->bitmap;
1910 
1911         UVM_ASSERT(PAGES_PER_UVM_VA_BLOCK == 32);
1912 
1913         // For 64KB pages, a single print suffices
1914         sprintf(buffer, "%08x", value);
1915     }
1916 }
1917 
1918 static uvm_va_block_region_t uvm_va_block_first_subregion_in_mask(uvm_va_block_region_t region,
1919                                                                   const uvm_page_mask_t *page_mask)
1920 {
1921     uvm_va_block_region_t subregion;
1922 
1923     if (!page_mask)
1924         return region;
1925 
1926     subregion.first = find_next_bit(page_mask->bitmap, region.outer, region.first);
1927     subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1);
1928     return subregion;
1929 }
1930 
1931 static uvm_va_block_region_t uvm_va_block_next_subregion_in_mask(uvm_va_block_region_t region,
1932                                                                  const uvm_page_mask_t *page_mask,
1933                                                                  uvm_va_block_region_t previous_subregion)
1934 {
1935     uvm_va_block_region_t subregion;
1936 
1937     if (!page_mask) {
1938         subregion.first = region.outer;
1939         subregion.outer = region.outer;
1940         return subregion;
1941     }
1942 
1943     subregion.first = find_next_bit(page_mask->bitmap, region.outer, previous_subregion.outer + 1);
1944     subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1);
1945     return subregion;
1946 }
1947 
1948 // Iterate over contiguous subregions of the region given by the page mask.
1949 // If the page mask is NULL then it behaves as if it was a fully set mask and
1950 // the only subregion iterated over will be the region itself.
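// For example, to process each contiguous run of pages selected by a mask
// (unmap_mask, region and process_subregion() are hypothetical):
//
//     uvm_va_block_region_t subregion;
//
//     for_each_va_block_subregion_in_mask(subregion, unmap_mask, region) {
//         // All pages in [subregion.first, subregion.outer) are set in unmap_mask
//         process_subregion(va_block, subregion);
//     }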
1951 #define for_each_va_block_subregion_in_mask(subregion, page_mask, region)                       \
1952     for ((subregion) = uvm_va_block_first_subregion_in_mask((region), (page_mask));             \
1953          (subregion).first != (region).outer;                                                   \
1954          (subregion) = uvm_va_block_next_subregion_in_mask((region), (page_mask), (subregion)))
1955 
1956 static uvm_page_index_t uvm_va_block_first_page_in_mask(uvm_va_block_region_t region,
1957                                                         const uvm_page_mask_t *page_mask)
1958 {
1959     if (page_mask)
1960         return find_next_bit(page_mask->bitmap, region.outer, region.first);
1961     else
1962         return region.first;
1963 }
1964 
1965 static uvm_page_index_t uvm_va_block_next_page_in_mask(uvm_va_block_region_t region,
1966                                                        const uvm_page_mask_t *page_mask,
1967                                                        uvm_page_index_t previous_page)
1968 {
1969     if (page_mask) {
1970         return find_next_bit(page_mask->bitmap, region.outer, previous_page + 1);
1971     }
1972     else {
1973         UVM_ASSERT(previous_page < region.outer);
1974         return previous_page + 1;
1975     }
1976 }
1977 
1978 static uvm_page_index_t uvm_va_block_first_unset_page_in_mask(uvm_va_block_region_t region,
1979                                                               const uvm_page_mask_t *page_mask)
1980 {
1981     if (page_mask)
1982         return find_next_zero_bit(page_mask->bitmap, region.outer, region.first);
1983     else
1984         return region.first;
1985 }
1986 
1987 static uvm_page_index_t uvm_va_block_next_unset_page_in_mask(uvm_va_block_region_t region,
1988                                                              const uvm_page_mask_t *page_mask,
1989                                                              uvm_page_index_t previous_page)
1990 {
1991     if (page_mask) {
1992         return find_next_zero_bit(page_mask->bitmap, region.outer, previous_page + 1);
1993     }
1994     else {
1995         UVM_ASSERT(previous_page < region.outer);
1996         return previous_page + 1;
1997     }
1998 }
1999 
2000 static NvU64 uvm_reverse_map_start(const uvm_reverse_map_t *reverse_map)
2001 {
2002     return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first);
2003 }
2004 
2005 static NvU64 uvm_reverse_map_end(const uvm_reverse_map_t *reverse_map)
2006 {
2007     return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first) +
2008            uvm_va_block_region_size(reverse_map->region) - 1;
2009 }
2010 
2011 // Iterate over contiguous pages of the region given by the page mask.
2012 // If the page mask is NULL then it behaves as if it was a fully set mask and
2013 // it will iterate over all pages within the region.
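// For example, counting the pages in 'region' that are resident on a given
// processor (uvm_page_mask_region_weight() computes the same count directly):
//
//     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor, NUMA_NO_NODE);
//     uvm_page_index_t page_index;
//     NvU32 count = 0;
//
//     for_each_va_block_page_in_region_mask(page_index, resident_mask, region)
//         count++;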
2014 #define for_each_va_block_page_in_region_mask(page_index, page_mask, region)                 \
2015     for ((page_index) = uvm_va_block_first_page_in_mask((region), (page_mask));              \
2016          (page_index) != (region).outer;                                                     \
2017          (page_index) = uvm_va_block_next_page_in_mask((region), (page_mask), (page_index)))
2018 
2019 // Same as for_each_va_block_page_in_region_mask, but the region spans the
2020 // whole given VA block
2021 #define for_each_va_block_page_in_mask(page_index, page_mask, va_block)                      \
2022     for_each_va_block_page_in_region_mask(page_index, page_mask, uvm_va_block_region_from_block(va_block))
2023 
2024 // Similar to for_each_va_block_page_in_region_mask, but iterating over pages
2025 // whose bit is unset.
2026 #define for_each_va_block_unset_page_in_region_mask(page_index, page_mask, region)           \
2027     for ((page_index) = uvm_va_block_first_unset_page_in_mask((region), (page_mask));        \
2028          (page_index) != (region).outer;                                                     \
2029          (page_index) = uvm_va_block_next_unset_page_in_mask((region), (page_mask), (page_index)))
2030 
2031 // Similar to for_each_va_block_page_in_mask, but iterating over pages whose
2032 // bit is unset.
2033 #define for_each_va_block_unset_page_in_mask(page_index, page_mask, va_block)                \
2034     for_each_va_block_unset_page_in_region_mask(page_index, page_mask, uvm_va_block_region_from_block(va_block))
2035 
2036 // Iterate over all pages within the given region
2037 #define for_each_va_block_page_in_region(page_index, region)                                 \
2038     for_each_va_block_page_in_region_mask((page_index), NULL, (region))
2039 
2040 // Iterate over all pages within the given VA block
2041 #define for_each_va_block_page(page_index, va_block)                                         \
2042     for_each_va_block_page_in_region((page_index), uvm_va_block_region_from_block(va_block))
2043 
2044 // Return the first vma intersecting the region [start, va_block->end]
2045 // or NULL if no such vma exists. Also returns the region covered by
2046 // the vma within the va_block.
2047 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
2048                                                     struct mm_struct *mm,
2049                                                     NvU64 start,
2050                                                     uvm_va_block_region_t *region);
2051 
2052 // Iterate over all vma regions covered by a va_block
2053 #define for_each_va_block_vma_region(va_block, mm, vma, region)                                 \
2054     for (vma = uvm_va_block_find_vma_region((va_block), (mm), (va_block)->start, (region));     \
2055          (vma);                                                                                 \
2056          vma = uvm_va_block_find_vma_region((va_block),                                         \
2057                                             (mm),                                               \
2058                                             uvm_va_block_region_end((va_block), *(region)) + 1, \
2059                                             (region)))
2060 
2061 // Return the block region covered by the given chunk size. page_index must be
2062 // any page within the block known to be covered by the chunk.
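// For example, with PAGE_SIZE == 4K, a 64KB chunk and a block whose start is
// 64KB-aligned, page_index 20 falls in the chunk covering pages [16, 32), so
// that region is returned.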
2063 static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block,
2064                                                        uvm_chunk_size_t chunk_size,
2065                                                        uvm_page_index_t page_index)
2066 {
2067     NvU64 page_addr = uvm_va_block_cpu_page_address(block, page_index);
2068     NvU64 chunk_start_addr = UVM_ALIGN_DOWN(page_addr, chunk_size);
2069     uvm_page_index_t first = (uvm_page_index_t)((chunk_start_addr - block->start) / PAGE_SIZE);
2070     return uvm_va_block_region(first, first + (chunk_size / PAGE_SIZE));
2071 }
2072 
2073 //
2074 // Helpers for page state (permissions, size, residency)
2075 //
2076 
2077 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
2078                                          uvm_page_index_t page_index,
2079                                          uvm_gpu_id_t gpu_id,
2080                                          uvm_prot_t required_prot);
2081 
2082 // Compute the processors that have a copy of the given page resident in their
2083 // memory.
2084 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2085                                            uvm_page_index_t page_index,
2086                                            uvm_processor_mask_t *resident_processors);
2087 
2088 // Count how many processors have a copy of the given page resident in their
2089 // memory.
2090 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index);
2091 
2092 // Get the processor with a resident copy of a page closest to the given
2093 // processor.
2094 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2095                                                           uvm_page_index_t page_index,
2096                                                           uvm_processor_id_t processor);
2097 
2098 // Mark CPU page page_index as resident on NUMA node specified by nid.
2099 // nid cannot be NUMA_NO_NODE.
2100 void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
2101 
2102 // Test if a CPU page is resident on NUMA node nid. If nid is NUMA_NO_NODE,
// the function will return true if the page is resident on any CPU NUMA node.
2104 bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
2105 
2106 // Test if all pages in region are resident on NUMA node nid. If nid is
2107 // NUMA_NO_NODE, the function will test if the pages in the region are
2108 // resident on any CPU NUMA node.
2109 bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region);
2110 
2111 // Insert a CPU chunk at the given page_index into the va_block.
2112 // Locking: The va_block lock must be held.
2113 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
2114 
2115 // Remove a CPU chunk at the given page_index from the va_block.
2116 // nid cannot be NUMA_NO_NODE.
2117 // Locking: The va_block lock must be held.
2118 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
2119 
2120 // Return the CPU chunk at the given page_index on the given NUMA node from the
2121 // va_block. nid cannot be NUMA_NO_NODE.
2122 // Locking: The va_block lock must be held.
2123 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
2124                                                   int nid,
2125                                                   uvm_page_index_t page_index);
2126 
2127 // Return the struct page * from the chunk corresponding to the given page_index
2128 // Locking: The va_block lock must be held.
2129 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
2130 
2131 // Return the struct page * of the resident chunk at the given page_index from
2132 // the va_block. The given page_index must be resident on the CPU.
2133 // Locking: The va_block lock must be held.
2134 struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index);
2135 
2136 // Physically map a CPU chunk so it is DMA'able from all registered GPUs.
2137 // nid cannot be NUMA_NO_NODE.
2138 // Locking: The va_block lock must be held.
2139 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
2140                                              uvm_cpu_chunk_t *chunk,
2141                                              uvm_page_index_t page_index);
2142 
2143 // Physically unmap a CPU chunk from all registered GPUs.
2144 // Locking: The va_block lock must be held.
2145 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
2146                                           uvm_cpu_chunk_t *chunk,
2147                                           uvm_page_index_t page_index);
2148 
2149 // Remove any CPU chunks in the given region.
2150 // Locking: The va_block lock must be held.
2151 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);
2152 
2153 // Get CPU page size or 0 if it is not mapped
2154 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
2155                                  uvm_page_index_t page_index);
2156 
2157 // Get GPU page size or 0 if it is not mapped on the given GPU
2158 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
2159 
2160 // Get page size or 0 if it is not mapped on the given processor
2161 static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
2162                                               uvm_processor_id_t processor_id,
2163                                               uvm_page_index_t page_index)
2164 {
2165     if (UVM_ID_IS_CPU(processor_id))
2166         return uvm_va_block_page_size_cpu(va_block, page_index);
2167     else
2168         return uvm_va_block_page_size_gpu(va_block, processor_id, page_index);
2169 }
2170 
2171 // Returns the big page size for the GPU VA space of the block
2172 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
2173 
2174 // Returns the number of big pages in the VA block for the given size
2175 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);
2176 
2177 // Returns the number of big pages in the VA block for the big page size on the
2178 // given GPU
2179 static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
2180 {
2181     return uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu));
2182 }
2183 
2184 // Returns the start address of the given big page index and big page size
2185 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);
2186 
2187 // Returns the region [start, end] of the given big page index and big page size
2188 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
2189                                                    size_t big_page_index,
2190                                                    NvU32 big_page_size);
2191 
// Returns the largest sub-region of [start, end] which can fit big
2193 // pages. If the region cannot fit any big pages, an invalid region (0, 0) is
2194 // returned.
2195 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
2196 
// Returns the largest sub-region of 'region' which can fit big pages.
2198 // If the region cannot fit any big pages, an invalid region (0, 0) is returned.
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
                                                          uvm_va_block_region_t region,
                                                          NvU32 big_page_size);

// Returns the big page index (the bit index within
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
// page_index cannot be covered by a big PTE due to alignment or block size,
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);
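
// Example (illustrative sketch only; 'gpu' is assumed to have a GPU VA space
// registered in this block's VA space, and big_page_size/big_page_index/region
// are local variables introduced for illustration). The helpers above are
// typically combined to walk each potential big PTE of a block for a GPU:
//
//     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
//     size_t big_page_index;
//
//     for (big_page_index = 0;
//          big_page_index < uvm_va_block_num_big_pages(va_block, big_page_size);
//          big_page_index++) {
//         uvm_va_block_region_t region =
//             uvm_va_block_big_page_region(va_block, big_page_index, big_page_size);
//
//         // 'region' covers exactly one big-page-aligned range; operate on it here.
//     }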

// Returns the new residency for a page that faulted or triggered access
// counter notifications. The read_duplicate output parameter indicates whether
// the page meets the requirements to be read-duplicated. va_block_context must
// not be NULL, and if the va_block is an HMM block, va_block_context->hmm.vma
// must be valid, which also means va_block_context->mm is not NULL, retained,
// and locked for at least read. See the comments for
// uvm_va_block_check_policy_is_valid() and uvm_hmm_check_context_vma_is_valid()
// in uvm_hmm.h.
// Locking: the va_block lock must be held.
uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
                                                 uvm_va_block_context_t *va_block_context,
                                                 uvm_page_index_t page_index,
                                                 uvm_processor_id_t processor_id,
                                                 NvU32 access_type_mask,
                                                 const uvm_va_policy_t *policy,
                                                 const uvm_perf_thrashing_hint_t *thrashing_hint,
                                                 uvm_service_operation_t operation,
                                                 bool *read_duplicate);

// Return the maximum mapping protection for processor_id that will not require
// any permission revocation on the rest of the processors.
uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
                                                        uvm_processor_id_t processor_id,
                                                        uvm_page_index_t page_index);

// A helper macro for handling allocation-retry
//
// The macro takes a VA block, a uvm_va_block_retry_t struct, and a function
// call to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED.
//
// block_retry can be NULL if it's not necessary for the function call,
// otherwise it will be initialized and deinitialized by the macro.
//
// The macro also locks and unlocks the block's lock internally, as the block's
// lock is expected to have been unlocked and relocked whenever the function
// call returns NV_ERR_MORE_PROCESSING_REQUIRED. This makes it clear that the
// block's state is not protected by the lock across these calls.
#define UVM_VA_BLOCK_LOCK_RETRY(va_block, block_retry, call) ({    \
    NV_STATUS status;                                               \
    uvm_va_block_t *__block = (va_block);                           \
    uvm_va_block_retry_t *__retry = (block_retry);                  \
                                                                    \
    uvm_va_block_retry_init(__retry);                               \
                                                                    \
    uvm_mutex_lock(&__block->lock);                                 \
                                                                    \
    do {                                                            \
        status = (call);                                            \
    } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);            \
                                                                    \
    uvm_mutex_unlock(&__block->lock);                               \
                                                                    \
    uvm_va_block_retry_deinit(__retry, __block);                    \
                                                                    \
    status;                                                         \
})
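
// Example (illustrative sketch only; block_op() stands in for any block
// operation that can return NV_ERR_MORE_PROCESSING_REQUIRED to request an
// allocation retry):
//
//     uvm_va_block_retry_t va_block_retry;
//     NV_STATUS status;
//
//     // The macro takes the block's lock, retries block_op() until it stops
//     // returning NV_ERR_MORE_PROCESSING_REQUIRED, then unlocks the block and
//     // deinitializes the retry struct:
//     status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
//                                      block_op(va_block, &va_block_retry));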

// A helper macro for handling allocation-retry
//
// The macro takes a VA block, a uvm_va_block_retry_t struct, and a function
// call to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED.
//
// block_retry can be NULL if it's not necessary for the function call,
// otherwise it will be initialized and deinitialized by the macro.
//
// This macro, as opposed to UVM_VA_BLOCK_LOCK_RETRY(), expects the block's
// lock to already be held. Notably, the block's lock might be unlocked and
// relocked as part of the call.
#define UVM_VA_BLOCK_RETRY_LOCKED(va_block, block_retry, call) ({  \
    NV_STATUS status;                                               \
    uvm_va_block_t *__block = (va_block);                           \
    uvm_va_block_retry_t *__retry = (block_retry);                  \
                                                                    \
    uvm_va_block_retry_init(__retry);                               \
                                                                    \
    uvm_assert_mutex_locked(&__block->lock);                        \
                                                                    \
    do {                                                            \
        status = (call);                                            \
    } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);            \
                                                                    \
    uvm_va_block_retry_deinit(__retry, __block);                    \
                                                                    \
    status;                                                         \
})
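
// Example (illustrative sketch only; block_op() is a stand-in as above). Unlike
// UVM_VA_BLOCK_LOCK_RETRY(), the caller owns the block's lock:
//
//     uvm_va_block_retry_t va_block_retry;
//     NV_STATUS status;
//
//     uvm_mutex_lock(&va_block->lock);
//     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
//                                        block_op(va_block, &va_block_retry));
//     uvm_mutex_unlock(&va_block->lock);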

#endif // __UVM_VA_BLOCK_H__