1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #ifndef __UVM_VA_BLOCK_H__
25 #define __UVM_VA_BLOCK_H__
26 
27 #include "uvm_forward_decl.h"
28 #include "uvm_types.h"
29 #include "uvm_linux.h"
30 #include "nv-kref.h"
31 #include "uvm_common.h"
32 #include "uvm_perf_module.h"
33 #include "uvm_processors.h"
34 #include "uvm_lock.h"
35 #include "uvm_test_ioctl.h"
36 #include "uvm_tracker.h"
37 #include "uvm_pmm_gpu.h"
38 #include "uvm_perf_thrashing.h"
39 #include "uvm_perf_utils.h"
40 #include "uvm_va_block_types.h"
41 #include "uvm_range_tree.h"
42 #include "uvm_mmu.h"
43 #include "nv-kthread-q.h"
44 
45 #include <linux/mmu_notifier.h>
46 #include <linux/wait.h>
47 
48 // VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
49 // (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
50 //
51 //  UVM: uvm_va_space -> uvm_va_range -> uvm_va_block
52 //  HMM: uvm_va_space -> uvm_va_block
53 //
54 // Each VA block is contained within a single VA range, and contains state on
55 // VAs covered by that block. Most importantly, the block tracks the current
56 // state of the virtual-to-physical mappings for all VAs within that block
57 // across all processors, along with the physical residency location for each
58 // VA.
59 //
60 // The block serializes both CPU and GPU operations on all VAs under that block.
61 // The CPU work is serialized with the block lock, and the GPU work is
62 // serialized by the block work tracker which itself is protected by the block
63 // lock.
64 //
65 // The size of each block varies from the size of the smallest VA range
66 // (PAGE_SIZE) to the max block size specified by UVM_VA_BLOCK_BITS. No block
67 // will span a 2^UVM_VA_BLOCK_BITS boundary in VA space. The size of the block
68 // is determined by the alignment of the parent VA range and the block's
69 // placement within the range.
70 //
// Note that this means user space will get the best allocation efficiency if
// it allocates memory in 2^UVM_VA_BLOCK_BITS naturally-aligned chunks.
73 
// Enums used for indexing into the array of pte_bits bitmaps in the VA block
75 // which hold the current state of each PTE. For a given {processor, PTE}, the
76 // bits represented here must be enough to re-create the non-address portion of
77 // the PTE for that processor.
78 
79 // If _READ is not set, the PTE mapping is not valid.
80 // If _WRITE is set, _READ is also set (_WRITE implies _READ).
81 typedef enum
82 {
83     UVM_PTE_BITS_CPU_READ,
84     UVM_PTE_BITS_CPU_WRITE,
85     UVM_PTE_BITS_CPU_MAX
86 } uvm_pte_bits_cpu_t;
87 
88 // If _READ is not set, the PTE mapping is not valid.
89 // If _WRITE is set, _READ is also set (_WRITE implies _READ).
90 // If _ATOMIC is set, _WRITE is also set (_ATOMIC implies _WRITE and _READ).
91 //
92 // TODO: Bug 1764925: Track volatile here too if we add GPU L2 caching
93 typedef enum
94 {
95     UVM_PTE_BITS_GPU_READ,
96     UVM_PTE_BITS_GPU_WRITE,
97     UVM_PTE_BITS_GPU_ATOMIC,
98     UVM_PTE_BITS_GPU_MAX
99 } uvm_pte_bits_gpu_t;
100 
101 typedef struct
102 {
103     // Per-page residency bit vector, used for fast traversal
104     // of resident pages.
105     //
106     // This follows the same semantics as the CPU residency bit vector and
107     // notably each bit still represents a PAGE_SIZE amount of data, but the
108     // physical GPU memory is tracked by an array of GPU chunks below.
109     uvm_page_mask_t resident;
110 
111     // Pages that have been evicted to sysmem
112     uvm_page_mask_t evicted;
113 
114     NvU64 *cpu_chunks_dma_addrs;
115 
    // Array of naturally-aligned chunks. Each chunk has the largest possible
    // size which can fit within the block, so they are not of uniform size.
118     //
119     // The number of chunks in the array is calculated using
120     // block_num_gpu_chunks. The size of each chunk is calculated using
121     // block_gpu_chunk_index.
122     uvm_gpu_chunk_t **chunks;
123 
124     // These page table ranges are not necessarily all used at the same time.
125     // The block might also be too small or not aligned properly to use the
126     // larger ranges, in which case they're never allocated.
127     //
128     // Once a range is allocated we keep it around to avoid constant allocation
129     // overhead when doing PTE splitting and merging.
130     //
131     // Check range.table to see if a given range has been allocated yet.
132     //
133     // page_table_range_big's range covers the big PTEs which fit within the
134     // interior of this block. See the big_ptes field.
135     uvm_page_table_range_t page_table_range_2m;
136     uvm_page_table_range_t page_table_range_big;
137     uvm_page_table_range_t page_table_range_4k;
138 
139     // These flags are ignored unless the {block, gpu} pair supports a 2M page
140     // size. In that case it's the responsibility of the block code to make the
141     // lower page tables active by calling uvm_page_tree_write_pde.
142     //
143     // They can be allocated and activated separately, so we have to track them
144     // separately.
145     //
146     // Activated only means that uvm_page_tree_write_pde has been called at some
147     // point in the past with the appropriate range allocated. It does not imply
148     // that the 2M entry is a PDE (see pte_is_2m).
149     bool activated_big;
150     bool activated_4k;
151 
152     // For {block, gpu} pairs which support the 2M page size, the page table
153     // ranges are uninitialized on allocation. This flag tracks whether the big
154     // PTEs have been initialized.
155     //
156     // We don't need an equivalent flag for the 4k range because we always write
157     // just the 4k PTEs not covered by higher-level PTEs. Big PTEs however can
158     // be allocated and activated late while the 4k PTEs are already active, in
159     // which case we need to initialize the entire big range.
160     bool initialized_big;
161 
162     // Sticky state to split PTEs to 4k and keep them there. Used when a fatal
163     // fault has been detected on this GPU to avoid false dependencies within
164     // the uTLB for fatal and non-fatal faults on the same larger PTE, which
165     // could lead to wrong fault attribution.
166     bool force_4k_ptes;
167 
168     // This table shows the HW PTE states given all permutations of pte_is_2m,
169     // big_ptes, and pte_bits. Note that the first row assumes that the 4k page
170     // tables have been allocated (if not, then no PDEs are allocated either).
171     //
172     // |-------------- SW state --------------|------------------- HW state --------------------|
173     //  pte_is_2m  pte_is_big  pte_bits[READ] | Page size  PDE0(2M only)  Big PTE       4k PTE
174     //  ----------------------------------------------------------------------------------------
175     //  0          0           0              | 4k         Valid PDE      Invalid [1]   Invalid
176     //  0          0           1              | 4k         Valid PDE      Invalid [1]   Valid
177     //  0          1           0              | Big        Valid PDE      Unmapped [2]  x
178     //  0          1           1              | Big        Valid PDE      Valid         x
179     //  1          must be 0   0              | 2M         Invalid        x             x
180     //  1          must be 0   1              | 2M         Valid PTE      x             x
181     //
182     // [1]: The big PTE may be unallocated, in which case its pointer won't be
183     //      valid in the parent PDE. If the big PTE is allocated, it will be
184     //      invalid so the 4k PTEs are active.
185     //
186     // [2]: The unmapped big PTE pattern differs from the invalid pattern, and
187     //      it prevents HW from reading the 4k entries. See the unmapped_pte()
188     //      MMU HAL function.
189 
190     // If pte_is_2m is true, there is a 2M PTE covering this VA block (valid or
191     // invalid). If false then we're in one of the following scenarios:
192     // 1) This {block, gpu} does not support 2M pages.
193     // 2) 2M pages are supported but the page_table_range_2m has not been
194     //    allocated (implying that the other page table ranges have not been
195     //    allocated either).
196     // 3) page_table_range_2m has been allocated, but the big_ptes bitmap should
197     //    be used to determine the mix of big and 4k PTEs.
198     bool pte_is_2m;
199 
200     // When pte_is_2m is false, this block consists of any possible mix of big
201     // and 4k PTEs. This bitmap describes that mix. A set bit indicates that the
202     // corresponding big-page-sized region of the block is covered by a big PTE.
203     // A cleared bit indicates that it is covered by 4k PTEs.
204     //
    // Neither setting implies that the PTE currently has a valid mapping; it
    // just indicates which PTE is read by the GPU (see the table above).
207     //
208     // The indices represent the corresponding big PTEs in the block's interior.
209     // For example, a block with alignment and size of one 4k page on either
    // side of a big page will only use bit 0. Use uvm_va_block_big_page_index
    // to look up the big_ptes index of a page.
212     //
213     // The block might not be able to fit any big PTEs, in which case this
214     // bitmap is always zero. Use uvm_va_block_gpu_num_big_pages to find the number of
215     // valid bits in this mask.
216     DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
217 
    // See the comments for uvm_va_block_t::cpu.pte_bits. A short usage sketch
    // follows this struct.
219     //
220     // The major difference is that these bits are always accurate since, unlike
221     // the CPU PTEs, the UVM driver is in full control of these mappings.
222     //
223     // Note that the granularity is always PAGE_SIZE, not whatever GPU PTE size
224     // happens to currently map these regions. PAGE_SIZE is the minimum
225     // granularity of operations on the VA blocks. As a future optimization we
226     // could consider sub-PAGE_SIZE operations if PAGE_SIZE > 4K and the CPU
227     // isn't involved, for example false sharing among peer GPUs.
228     uvm_page_mask_t pte_bits[UVM_PTE_BITS_GPU_MAX];
229 
230 } uvm_va_block_gpu_state_t;
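
// A minimal illustrative sketch (not part of the driver API) of how the GPU
// pte_bits implication chain documented above is typically consumed: since
// _ATOMIC implies _WRITE and _WRITE implies _READ, the highest set bit for a
// page gives its current maximum GPU protection. uvm_page_mask_test() and the
// uvm_prot_t values are assumed to come from the included headers.
//
//     static uvm_prot_t example_gpu_prot(uvm_va_block_gpu_state_t *gpu_state,
//                                        uvm_page_index_t page_index)
//     {
//         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index))
//             return UVM_PROT_READ_WRITE_ATOMIC;
//         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index))
//             return UVM_PROT_READ_WRITE;
//         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
//             return UVM_PROT_READ_ONLY;
//         return UVM_PROT_NONE;
//     }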
231 
232 // TODO: Bug 1766180: Worst-case we could have one of these per system page.
233 //       Options:
234 //       1) Rely on the OOM killer to prevent the user from trying to do that
235 //       2) Be much more space-conscious in this struct (difficult)
236 //       3) Cap the per-process range and/or block count, like vm.max_map_count
237 //          does for vmas
238 struct uvm_va_block_struct
239 {
240     // Reference count for this block. References are held by:
241     // - The parent VA range for managed blocks or VA space for HMM blocks
242     // - The reverse map
243     // - The eviction path temporarily when attempting to evict a GPU page under
244     //   this block
245     //
246     // This isn't protected by the lock on the eviction path, so it must be
247     // atomic. nv_kref provides that.
248     nv_kref_t kref;
249 
250     // Lock protecting the block. See the comment at the top of uvm.c.
251     uvm_mutex_t lock;
252 
253     // Parent VA range. Managed blocks have this set. HMM blocks will have
254     // va_range set to NULL and hmm.va_space set instead. Dead blocks that are
255     // waiting for the last ref count to be removed have va_range and
256     // hmm.va_space set to NULL (could be either type of block).
257     //
258     // This field can be read while holding either the block lock or just the VA
259     // space lock in read mode, since it can only change when the VA space lock
260     // is held in write mode.
261     uvm_va_range_t *va_range;
262 
263     // Virtual address [start, end] covered by this block. These fields can be
264     // read while holding either the block lock or just the VA space lock in
265     // read mode, since they can only change when the VA space lock is held in
266     // write mode.
267     NvU64 start;
268     NvU64 end;
269 
270     // Per-processor residency bit vector, used for fast lookup of which
271     // processors are active in this block.
272     //
273     // A set bit means the corresponding processor has a coherent physical copy
274     // of memory somewhere in the block. The per-processor state must then be
275     // inspected to find out which pages. The processor may or may not have a
276     // mapping to that physical memory, however.
277     //
278     // A cleared bit means the corresponding processor does not have a coherent
279     // physical copy of any pages under this block. The processor may still have
280     // cached pages allocated for future use, however. It also may have mappings
281     // to pages resident on other processors.
282     uvm_processor_mask_t resident;
283 
284     // Per-processor mapping bit vector, used for fast lookup of which
285     // processors are active in this block.
286     //
287     // A set bit means the corresponding processor has an active, valid page
288     // table mapping to some VA in this block. The per-processor pte_bits state
289     // must then be inspected to find out the mapping address and permissions.
290     //
291     // A cleared bit means the corresponding processor has no virtual mappings
292     // within this block (all pte_bits entries are 0).
293     uvm_processor_mask_t mapped;
294 
295     // Per-processor evicted bit vector, used for fast lookup of which GPUs
296     // have evicted pages in this block.
297     //
    // A set bit means that some pages in the block were resident on the
    // corresponding processor when they were evicted due to memory capacity
    // limitations. The per-processor state must then be inspected to find out
    // which pages.
302     //
303     // A cleared bit means the corresponding processor has no evicted pages
304     // within this block (all evicted entries are 0).
305     uvm_processor_mask_t evicted_gpus;
306 
307     struct
308     {
309         // Per-page residency bit vector, used for fast traversal of resident
310         // pages.
311         //
312         // A set bit means the CPU has a coherent copy of the physical page
313         // resident in its memory, and that the corresponding entry in the pages
314         // array is present. This does not mean that the coherent copy is
315         // currently mapped anywhere, however. A page may be resident on
316         // multiple processors when in read-duplicate mode.
317         //
318         // A cleared bit means the CPU does not have a coherent copy of that
        // page resident. The corresponding entry in the pages array may or may
        // not be present. If the entry is present, it's a cached page which can
        // be reused in the future.
322         //
323         // Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
324         // smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
325         // overhead on the whole.
326         uvm_page_mask_t resident;
327 
328         // CPU memory chunks represent physically contiguous CPU memory
329         // allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
330         // This member is meant to hold an opaque value indicating the CPU
331         // chunk storage method. For more details on CPU chunk storage,
332         // see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
333         unsigned long chunks;
334 
335         // Per-page allocation bit vector.
336         //
337         // A set bit means that a CPU page has been allocated for the
338         // corresponding page index.
339         uvm_page_mask_t allocated;
340 
341         // Per-page mapping bit vectors, one per bit we need to track. These are
342         // used for fast traversal of valid mappings in the block. These contain
343         // all non-address bits needed to establish a virtual mapping on this
344         // processor (permissions, cacheability, etc).
345         //
346         // A cleared bit in UVM_PTE_BITS_CPU_READ means the CPU has no valid
347         // virtual mapping to that address (the access will fault). Further,
348         // UVM_PTE_BITS_CPU_WRITE is guaranteed to also be clear.
349         //
350         // A set bit in UVM_PTE_BITS_CPU_READ means the CPU has a valid mapping
351         // at that address with at least read permissions. The physical page for
352         // that mapping is contained in the pages array. If
353         // UVM_PTE_BITS_CPU_WRITE is not set, the mapping is read-only.
354         // Otherwise, the mapping is read-write.
355         //
356         // For managed allocations, this is the maximum permissions a PTE
357         // could have, but not necessarily the actual current permissions of the
358         // CPU PTEs. The UVM driver will never change the PTEs without updating
359         // this state, but the kernel can downgrade our CPU mappings at any time
360         // without notifying the UVM driver (for example in response to user
361         // space calling madvise with MADV_DONTNEED).
362         //
363         // For HMM allocations, this is the minimum permission the CPU has since
364         // Linux can upgrade a read-only PTE to read-write without notifying
365         // the UVM driver. This is why read duplication isn't currently
366         // supported.
367         // TODO: Bug 3660922: Need to handle read duplication at some point.
368         uvm_page_mask_t pte_bits[UVM_PTE_BITS_CPU_MAX];
369 
370         // Whether the CPU has ever mapped a page on this VA block. This is
371         // used to force GMMU PDE1 pre-population on ATS systems. See
372         // pre_populate_gpu_pde1 in uvm_va_block.c for more information.
373         NvU8 ever_mapped        : 1;
374 
375         // We can get "unexpected" faults if multiple CPU threads fault on the
376         // same address simultaneously and race to create the mapping. Since
377         // our CPU fault handler always unmaps to handle the case where the
378         // kernel downgrades our CPU mappings, we can introduce an infinite
379         // stream of CPU faults in multi-threaded workloads.
380         //
381         // In order to handle this scenario, we keep track of the first thread
382         // that faulted on a page with valid permissions and the timestamp.
383         // Then, we keep track of the subsequent faults on that page during a
384         // window of time. If the first thread faults again on the page, that
385         // will indicate that the mapping has been downgraded by the kernel and
        // we need to remap it. Faults from the rest of the threads are simply
        // ignored. The information is also cleared on the following events:
388         // - The tracking window finishes
389         // - The page is unmapped
390         struct
391         {
            // Timestamp when the first fault was detected. This is also used
            // as a flag indicating that the contents of this struct are valid.
394             NvU64             first_fault_stamp;
395 
            // First thread that faulted while having valid permissions. We
            // don't take a reference on the pid, so we shouldn't ever use it
            // for task lookup in the kernel. We only use it as a heuristic, so
            // it's OK if the pid gets destroyed or reused.
400             pid_t             first_pid;
401 
402             // Index of the page whose faults are being tracked
403             uvm_page_index_t  page_index;
404         } fault_authorized;
405     } cpu;
406 
407     // Per-GPU residency and mapping state
408     //
409     // TODO: Bug 1766180: Even though these are pointers, making this a static
410     //       array will use up a non-trivial amount of storage for small blocks.
411     //       In most cases we won't have anywhere near this many GPUs active
412     //       anyway. Consider using a dense array of just the GPUs registered in
413     //       this VA space, depending on the perf of accessing that array and on
414     //       how noticeable this memory overhead actually is.
415     uvm_va_block_gpu_state_t *gpus[UVM_ID_MAX_GPUS];
416 
417     // Mask to keep track of the pages that are read-duplicate
418     uvm_page_mask_t read_duplicated_pages;
419 
420     // Mask to keep track of the pages that are not mapped on any non-UVM-Lite
421     // processor.
422     //     0: Page is definitely not mapped by any processors
423     //     1: Page may or may not be mapped by a processor
424     //
425     // This mask sets the bit when the page is mapped on any non-UVM-Lite
426     // processor but it is not always unset on unmap (to avoid a performance
    // impact). Therefore, it can contain false negatives. It should only be
    // used for opportunistic optimizations that have a fast path for pages
429     // that are not mapped anywhere (see uvm_va_block_migrate_locked, for
430     // example), but not the other way around.
431     uvm_page_mask_t maybe_mapped_pages;
432 
433     // Tracks all outstanding GPU work related to this block: GPU copies, PTE
434     // updates, TLB invalidates, etc. The residency and mapping state is only
435     // valid once this tracker is done.
436     //
437     // CPU operations need to wait for this tracker to be done. GPU operations
438     // need to acquire it before pushing their work, then that work must be
439     // added to this tracker before the block's lock is dropped.
440     uvm_tracker_t tracker;
441 
442     // A queue item for establishing eviction mappings in a deferred way
443     nv_kthread_q_item_t eviction_mappings_q_item;
444 
445     uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
446 
    // Prefetch information that is updated while holding the va_block lock but
    // records state while the lock is not held.
449     struct
450     {
451         uvm_processor_id_t last_migration_proc_id;
452 
453         NvU16 fault_migrations_to_last_proc;
454     } prefetch_info;
455 
456 #if UVM_IS_CONFIG_HMM()
457     struct
458     {
459         // The MMU notifier is registered per va_block.
460         struct mmu_interval_notifier notifier;
461 
462         // Wait queue for GPU atomic operations to system memory.
463         struct wait_queue_head atomic_waitq;
464 
465         // Mask of pages being migrated to system memory for GPU atomic access.
466         // It is used so other threads don't try to migrate those pages while
467         // make_device_exclusive_range() is called without holding the va_block
468         // lock.
469         uvm_page_mask_t atomic_busy;
470 
471         // Sequence number to tell if any changes were made to the va_block
472         // while not holding the block lock and calling hmm_range_fault().
473         unsigned long changed;
474 
475         // Parent VA space pointer. It is NULL for managed blocks or if
476         // the HMM block is dead. This field can be read while holding the
477         // block lock and is only modified while holding the va_space write
478         // lock and va_block lock (same as the va_range pointer).
479         uvm_va_space_t *va_space;
480 
481         // Tree of uvm_va_policy_node_t. The policy node ranges always cover
482         // all or part of a VMA range or a contiguous range of VMAs within the
483         // va_block. Policy nodes are resized or deleted when the underlying
484         // VMA range is changed by Linux via the invalidate() callback.
485         // Otherwise, policies could be stale after munmap().
486         // Locking: The va_block lock is needed to access or modify the tree.
487         uvm_range_tree_t va_policy_tree;
488 
489         // Storage node for range tree of va_blocks.
490         uvm_range_tree_node_t node;
491     } hmm;
492 #endif
493 };
494 
// We define additional per-VA-block fields for testing. When
// uvm_enable_builtin_tests is defined, all VA blocks are allocated with the
// size of uvm_va_block_wrapper_t. Otherwise, the test fields are not available.
498 // Use the uvm_va_block_get_test function defined below to obtain a safe
499 // pointer to uvm_va_block_test_t from a uvm_va_block_t pointer.
500 struct uvm_va_block_wrapper_struct
501 {
502     uvm_va_block_t block;
503 
504     struct uvm_va_block_test_struct
505     {
506         // Count of how many page table allocations should be forced to retry
507         // with eviction enabled. Used for testing only.
508         NvU32 page_table_allocation_retry_force_count;
509 
510         // Count of how many user pages allocations should be forced to retry
511         // with eviction enabled. Used for testing only.
512         NvU32 user_pages_allocation_retry_force_count;
513 
514         // Mask of chunk sizes to be used for CPU chunk allocations.
515         // The actual set of chunk sizes to be used will be the set resulting
516         // from AND'ing this value with the value of
517         // uvm_cpu_chunk_allocation_sizes module parameter.
518         NvU32 cpu_chunk_allocation_size_mask;
519 
520         // Subsequent operations that need to allocate CPU pages will fail. As
521         // opposed to other error injection settings, this one fails N times
522         // and then succeeds instead of failing on the Nth try. A value of ~0u
523         // means fail indefinitely.
524         // This is because this error is supposed to be fatal and tests verify
525         // the state of the VA blocks after the failure. However, some tests
526         // use kernels to trigger migrations and a fault replay could trigger
527         // a successful migration if this error flag is cleared.
528         NvU32 inject_cpu_pages_allocation_error_count;
529 
530         // Force the next eviction attempt on this block to fail. Used for
531         // testing only.
532         bool inject_eviction_error;
533 
534         // Force the next successful chunk allocation to then fail. Used for testing
535         // only to simulate driver metadata allocation failure.
536         bool inject_populate_error;
537 
538         // Force the next split on this block to fail.
539         // Set by error injection ioctl for testing purposes only.
540         bool inject_split_error;
541     } test;
542 };
543 
544 // Tracking needed for supporting allocation-retry of user GPU memory
545 struct uvm_va_block_retry_struct
546 {
547     // A tracker used for all allocations from PMM.
548     uvm_tracker_t tracker;
549 
550     // List of allocated chunks (uvm_gpu_chunk_t). Currently all chunks are of
551     // the same size. However it can contain chunks from multiple GPUs. All
552     // remaining free chunks are freed when the operation is finished with
553     // uvm_va_block_retry_deinit().
554     struct list_head free_chunks;
555 
556     // List of chunks allocated and used during the block operation. This list
557     // can contain chunks from multiple GPUs. All the used chunks are unpinned
558     // when the operation is finished with uvm_va_block_retry_deinit().
559     struct list_head used_chunks;
560 };
561 
562 // Module load/exit
563 NV_STATUS uvm_va_block_init(void);
564 void uvm_va_block_exit(void);
565 
566 // Allocates and initializes the block. The block's ref count is initialized to
567 // 1. The caller is responsible for inserting the block into its parent
568 // va_range.
569 //
570 // The caller must be holding the VA space lock in at least read mode.
571 //
572 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
573 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
574                               NvU64 start,
575                               NvU64 end,
576                               uvm_va_block_t **out_block);
577 
578 // Internal function called only when uvm_va_block_release drops the ref count
579 // to 0. Do not call directly.
580 void uvm_va_block_destroy(nv_kref_t *kref);
581 
582 static inline void uvm_va_block_retain(uvm_va_block_t *va_block)
583 {
584     nv_kref_get(&va_block->kref);
585 }
586 
587 // Locking: The va_block lock must not be held.
588 // The va_space lock must be held in write mode unless it is the special case
589 // that the block has no GPU state; for example, right after calling
590 // uvm_va_block_create(). In that case, the va_space lock can be held in read
591 // mode.
592 static inline void uvm_va_block_release(uvm_va_block_t *va_block)
593 {
594     if (va_block) {
595         // The calling thread shouldn't be holding the block's mutex when
596         // releasing the block as it might get destroyed.
597         uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_BLOCK);
598         nv_kref_put(&va_block->kref, uvm_va_block_destroy);
599     }
600 }
601 
602 // Same as uvm_va_block_release but the caller may be holding the VA block lock.
603 // The caller must ensure that the refcount will not get to zero in this call.
604 static inline void uvm_va_block_release_no_destroy(uvm_va_block_t *va_block)
605 {
606     int destroyed = nv_kref_put(&va_block->kref, uvm_va_block_destroy);
607     UVM_ASSERT(!destroyed);
608 }
609 
610 // Returns true if the block is managed by HMM.
611 // Locking: This can be called while holding either the block lock or just the
612 // VA space lock in read mode, since it can only change when the VA space lock
613 // is held in write mode.
614 static inline bool uvm_va_block_is_hmm(uvm_va_block_t *va_block)
615 {
616 #if UVM_IS_CONFIG_HMM()
617     return va_block->hmm.va_space;
618 #else
619     return false;
620 #endif
621 }
622 
623 // Return true if the block is dead.
624 // Locking: This can be called while holding either the block lock or just the
625 // VA space lock in read mode, since it can only change when the VA space lock
626 // is held in write mode.
627 static inline bool uvm_va_block_is_dead(uvm_va_block_t *va_block)
628 {
629     if (va_block->va_range)
630         return false;
631 
632 #if UVM_IS_CONFIG_HMM()
633     if (va_block->hmm.va_space)
634         return false;
635 #endif
636 
637     return true;
638 }
639 
640 static inline uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id)
641 {
642     return va_block->gpus[uvm_id_gpu_index(gpu_id)];
643 }
644 
645 // Return the va_space pointer of the given block or NULL if the block is dead.
646 // Locking: This can be called while holding either the block lock or just the
647 // VA space lock in read mode, since it can only change when the VA space lock
648 // is held in write mode.
649 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block);
650 
651 // Return the va_space pointer of the given block assuming the block is not dead
652 // (asserts that it is not dead and asserts va_space is not NULL).
653 // Locking: This can be called while holding either the block lock or just the
654 // VA space lock in read mode, since it can only change when the VA space lock
655 // is held in write mode.
656 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block);
657 
658 // Return true if the VA space has access counter migrations enabled and should
659 // remote map pages evicted to system memory. This is OK since access counters
660 // can pull the data back to vidmem if sufficient accesses trigger a migration.
661 // The caller must ensure that the VA space cannot go away.
662 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space);
663 
664 // Dynamic cache-based allocation for uvm_va_block_context_t.
665 //
666 // See uvm_va_block_context_init() for a description of the mm parameter.
667 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm);
668 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
669 
670 // Initialization of an already-allocated uvm_va_block_context_t.
671 //
672 // mm is used to initialize the value of va_block_context->mm. NULL is allowed.
673 static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
674 {
675     UVM_ASSERT(va_block_context);
676 
677     // Write garbage into the VA Block context to ensure that the UVM code
678     // clears masks appropriately
679     if (UVM_IS_DEBUG())
680         memset(va_block_context, 0xff, sizeof(*va_block_context));
681 
682     va_block_context->mm = mm;
683 #if UVM_IS_CONFIG_HMM()
684     va_block_context->hmm.vma = NULL;
685 #endif
686 }
687 
688 // Check that a single policy covers the given region for the given va_block.
689 // This always returns true and is intended to only be used with UVM_ASSERT().
690 // Locking: the va_block lock must be held.
691 bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
692                                         const uvm_va_policy_t *policy,
693                                         uvm_va_block_region_t region);
694 
695 // TODO: Bug 1766480: Using only page masks instead of a combination of regions
696 //       and page masks could simplify the below APIs and their implementations
697 //       at the cost of having to scan the whole mask for small regions.
698 //       Investigate the performance effects of doing that.
699 
700 // Moves the physical pages of the given region onto the destination processor.
701 // If page_mask is non-NULL, the movement is further restricted to only those
702 // pages in the region which are present in the mask.
703 //
704 // prefetch_page_mask may be passed as a subset of page_mask when cause is
705 // UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT,
706 // UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT, or
707 // UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER to indicate pages that have been
708 // pulled due to automatic page prefetching heuristics. For pages in this mask,
709 // UVM_MAKE_RESIDENT_CAUSE_PREFETCH will be reported in migration events,
710 // instead.
711 //
712 // This function breaks read duplication for all given pages even if they
713 // don't migrate. Pages which are not resident on the destination processor
714 // will also be unmapped from all existing processors, be populated in the
715 // destination processor's memory, and copied to the new physical location.
716 // Any new memory will be zeroed if it is the first allocation for that page
717 // in the system.
718 //
719 // This function does not create any new virtual mappings.
720 //
721 // This function acquires/waits for the va_block tracker and updates that
722 // tracker with any new work pushed.
723 //
724 // Allocation-retry: this operation may need to perform eviction to be able to
725 // allocate GPU memory successfully and if that happens,
726 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned. That also means that the
727 // block's lock has been unlocked and relocked as part of the call and that the
728 // whole sequence of operations performed under the block's lock needs to be
729 // attempted again. To facilitate that, the caller needs to provide the same
730 // va_block_retry struct for each attempt that has been initialized before the
731 // first attempt and needs to be deinitialized after the last one. Most callers
732 // can just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the
733 // caller.
734 //
735 // If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
736 // user memory is guaranteed not to happen. Allocation-retry of GPU page tables
737 // can still occur though.
738 //
739 // va_block_context must not be NULL. This function will set a bit in
740 // va_block_context->make_resident.pages_changed_residency for each page that
741 // changed residency (due to a migration or first population) as a result of the
742 // operation and va_block_context->make_resident.all_involved_processors for
743 // each processor involved in the copy. This function only sets bits in those
// masks. It is the caller's responsibility to zero the masks beforehand if
// needed.
745 //
746 // va_block_context->policy must also be set by the caller for the given region.
747 // See the comments for uvm_va_block_check_policy_is_valid().
748 //
749 // Notably any status other than NV_OK indicates that the block's lock might
750 // have been unlocked and relocked.
751 //
752 // LOCKING: The caller must hold the va_block lock.
753 // If va_block_context->mm != NULL, va_block_context->mm->mmap_lock must be
754 // held in at least read mode.
755 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
756                                      uvm_va_block_retry_t *va_block_retry,
757                                      uvm_va_block_context_t *va_block_context,
758                                      uvm_processor_id_t dest_id,
759                                      uvm_va_block_region_t region,
760                                      const uvm_page_mask_t *page_mask,
761                                      const uvm_page_mask_t *prefetch_page_mask,
762                                      uvm_make_resident_cause_t cause);
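
// A hedged sketch of the allocation-retry pattern described above, using the
// UVM_VA_BLOCK_LOCK_RETRY() helper mentioned in the comment. The exact macro
// signature is assumed from elsewhere in the driver, so treat this as
// illustrative only:
//
//     uvm_va_block_retry_t va_block_retry;
//     NV_STATUS status;
//
//     status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
//                  uvm_va_block_make_resident(va_block,
//                                             &va_block_retry,
//                                             va_block_context,
//                                             dest_id,
//                                             region,
//                                             NULL, // page_mask: whole region
//                                             NULL, // prefetch_page_mask
//                                             UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT));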
763 
764 // Similar to uvm_va_block_make_resident (read documentation there). The main
765 // differences are:
766 // - Pages are copied not moved (i.e. other copies of the page are not
767 //   unmapped)
768 // - Processors with a resident copy of pages that migrated have write and
769 //   atomic access permission revoked, unlike in uvm_va_block_make_resident
770 //   where they are unmapped
771 // - All remote mappings (due to either SetAccessedBy or performance heuristics)
772 //   are broken
773 // - Only managed va_blocks are supported.
774 //   TODO: Bug 3660922: need to implement HMM read duplication support.
775 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
776                                                     uvm_va_block_retry_t *va_block_retry,
777                                                     uvm_va_block_context_t *va_block_context,
778                                                     uvm_processor_id_t dest_id,
779                                                     uvm_va_block_region_t region,
780                                                     const uvm_page_mask_t *page_mask,
781                                                     const uvm_page_mask_t *prefetch_page_mask,
782                                                     uvm_make_resident_cause_t cause);
783 
784 // Similar to uvm_va_block_make_resident() (read documentation there). The
785 // difference is that source pages are only copied to the destination and the
786 // residency is not updated until uvm_va_block_make_resident_finish() is called.
787 // Otherwise, the combination of uvm_va_block_make_resident_copy() and
788 // uvm_va_block_make_resident_finish() is the same as just calling
789 // uvm_va_block_make_resident(). Note, however, that the va_block lock must be
790 // held across the two calls for the operation to be complete. The va_block
791 // lock can be dropped after calling uvm_va_block_make_resident_copy() but
792 // uvm_va_block_make_resident_copy() must be called again after relocking the
793 // va_block lock and before calling uvm_va_block_make_resident_finish().
794 // This split is needed when using migrate_vma_setup() and migrate_vma_pages()
795 // so that when migrate_vma_pages() indicates a page is not migrating, the
796 // va_block state is not updated.
797 // LOCKING: The caller must hold the va_block lock.
798 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
799                                           uvm_va_block_retry_t *va_block_retry,
800                                           uvm_va_block_context_t *va_block_context,
801                                           uvm_processor_id_t dest_id,
802                                           uvm_va_block_region_t region,
803                                           const uvm_page_mask_t *page_mask,
804                                           const uvm_page_mask_t *prefetch_page_mask,
805                                           uvm_make_resident_cause_t cause);
806 
807 // The page_mask must be the same or a subset of the page_mask passed to
808 // uvm_va_block_make_resident_copy(). This step updates the residency and breaks
809 // read duplication.
810 // LOCKING: The caller must hold the va_block lock.
811 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
812                                        uvm_va_block_context_t *va_block_context,
813                                        uvm_va_block_region_t region,
814                                        const uvm_page_mask_t *page_mask);
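
// A hedged sketch of the two-phase pattern described above, as used with
// migrate_vma_setup()/migrate_vma_pages() (illustrative only; the migrate_vma
// plumbing and the filtering of non-migrating pages out of page_mask are
// omitted):
//
//     status = uvm_va_block_make_resident_copy(va_block, va_block_retry,
//                                              va_block_context, dest_id,
//                                              region, page_mask, NULL, cause);
//     if (status != NV_OK)
//         return status;
//
//     // ... migrate_vma_pages() runs here; pages which did not migrate must
//     // not be present in page_mask when finishing ...
//
//     uvm_va_block_make_resident_finish(va_block, va_block_context,
//                                       region, page_mask);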
815 
816 // Creates or upgrades a mapping from the input processor to the given virtual
817 // address region. Pages which already have new_prot permissions or higher are
818 // skipped, so this call ensures that the range is mapped with at least new_prot
819 // permissions. new_prot must not be UVM_PROT_NONE. uvm_va_block_unmap or
820 // uvm_va_block_revoke_prot should be used to downgrade permissions instead.
821 //
822 // The mapped pages are described by the region parameter and the map page mask
823 // that allows the caller to restrict the map operation to specific pages within
824 // the region. If the page mask is NULL then the whole region is mapped.
825 //
826 // If the input processor is a GPU with no GPU VA space registered, or if the
827 // input processor is the CPU and this thread is not allowed to create CPU
828 // mappings, this function does nothing. CPU mappings are only allowed if
829 // uvm_va_range_vma_check(va_block_context->mm) is valid, so the caller must
830 // set va_block_context->mm before calling this function.
831 //
832 // cause specifies the cause to be reported in events in case a remote mapping
833 // is created.
834 //
835 // Any CPU mappings will wait for the va_block tracker. If this function pushes
836 // GPU work it will first acquire the va_block tracker, then add the pushed work
837 // to out_tracker. It is the caller's responsibility to add this work to
838 // va_block's tracker. Note that while it is generally safe to run map
839 // operations on different GPUs concurrently, two PTE operations (map, unmap,
840 // revoke) on the same GPU must be serialized even if they target different
841 // pages because the earlier operation can cause a PTE split or merge which is
842 // assumed by the later operation.
843 //
844 // va_block_context must not be NULL and va_block_context->policy must be valid.
845 // See the comments for uvm_va_block_check_policy_is_valid().
846 //
847 // If allocation-retry was required as part of the operation and was successful,
848 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
849 // out_tracker were added to the block's tracker and then the block's lock was
850 // unlocked and relocked.
851 //
852 // In general, any status other than NV_OK indicates that the block's lock might
853 // have been unlocked and relocked.
854 //
855 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
856 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
857 //          mode.
858 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
859                            uvm_va_block_context_t *va_block_context,
860                            uvm_processor_id_t id,
861                            uvm_va_block_region_t region,
862                            const uvm_page_mask_t *map_page_mask,
863                            uvm_prot_t new_prot,
864                            UvmEventMapRemoteCause cause,
865                            uvm_tracker_t *out_tracker);
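
// A hedged sketch of the out_tracker handling described above. The tracker
// helpers (UVM_TRACKER_INIT, uvm_tracker_add_tracker_safe, uvm_tracker_deinit)
// are assumed from uvm_tracker.h, so treat this as illustrative only:
//
//     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
//     NV_STATUS status, tracker_status;
//
//     status = uvm_va_block_map(va_block, va_block_context, id, region,
//                               NULL, UVM_PROT_READ_WRITE, cause, &local_tracker);
//
//     // Regardless of status, any pushed work must end up in the block's
//     // tracker before the block lock is dropped.
//     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
//     uvm_tracker_deinit(&local_tracker);
//
//     if (status == NV_OK)
//         status = tracker_status;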
866 
867 // Like uvm_va_block_map, except it maps all processors in the input mask. The
868 // VA block tracker contains all map operations on return.
869 //
870 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
871 // uvm_va_block_map() indicating that the operation needs to be retried.
872 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
873                                 uvm_va_block_context_t *va_block_context,
874                                 const uvm_processor_mask_t *map_processor_mask,
875                                 uvm_va_block_region_t region,
876                                 const uvm_page_mask_t *map_page_mask,
877                                 uvm_prot_t new_prot,
878                                 UvmEventMapRemoteCause cause);
879 
880 // Unmaps virtual regions from a single processor. This does not free page
881 // tables or physical memory. This is safe to call on the eviction path, but the
882 // caller must ensure that the block hasn't been killed.
883 //
884 // The unmapped pages are described by the region parameter and the unmap page
885 // mask that allows the caller to restrict the unmap operation to specific pages
886 // within the region. If the page mask is NULL then the whole region is
887 // unmapped.
888 //
889 // If id is UVM_ID_CPU, this is guaranteed to return NV_OK, and this is safe to
890 // call without holding a reference on the mm which owns the associated vma.
891 //
892 // Any CPU unmappings will wait for the va_block tracker. If this function
893 // pushes GPU work it will first acquire the va_block tracker, then add the
894 // pushed work to out_tracker. It is the caller's responsibility to add this
895 // work to va_block's tracker. Note that while it is generally safe to run unmap
896 // operations on different GPUs concurrently, two PTE operations (map, unmap,
897 // revoke) on the same GPU must be serialized even if they target different
898 // pages because the earlier operation can cause a PTE split or merge which is
899 // assumed by the later operation.
900 //
901 // va_block_context must not be NULL. The va_block_context->policy is unused.
902 //
903 // If allocation-retry was required as part of the operation and was successful,
904 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
905 // out_tracker were added to the block's tracker and then the block's lock was
906 // unlocked and relocked. It is guaranteed that retry will not be required if
907 // the unmap does not cause a PTE split. Examples of operations which will not
908 // cause a PTE split include unmapping the entire block, unmapping all PTEs with
909 // matching attributes, and unmapping all PTEs which point to the same physical
910 // chunk.
911 //
912 // LOCKING: The caller must hold the va_block lock.
913 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
914                              uvm_va_block_context_t *va_block_context,
915                              uvm_processor_id_t id,
916                              uvm_va_block_region_t region,
917                              const uvm_page_mask_t *unmap_page_mask,
918                              uvm_tracker_t *out_tracker);
919 
920 // Like uvm_va_block_unmap, except it unmaps all processors in the input mask.
921 // The VA block tracker contains all map operations on return.
922 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
923                                   uvm_va_block_context_t *va_block_context,
924                                   const uvm_processor_mask_t *unmap_processor_mask,
925                                   uvm_va_block_region_t region,
926                                   const uvm_page_mask_t *unmap_page_mask);
927 
928 // Function called when the preferred location changes. Notably:
929 // - Mark all CPU pages as dirty because the new processor may not have
930 //   up-to-date data.
931 // - Unmap the preferred location's processor from any pages in this region
932 //   which are not resident on the preferred location.
933 //
934 // va_block_context must not be NULL and va_block_context->policy must be valid.
935 // See the comments for uvm_va_block_check_policy_is_valid().
936 //
937 // LOCKING: The caller must hold the VA block lock.
938 NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
939                                                      uvm_va_block_context_t *va_block_context,
940                                                      uvm_va_block_region_t region);
941 
942 // Maps the given processor to all resident pages in this block, as allowed by
943 // location and policy. Waits for the operation to complete before returning.
944 // This function should only be called with managed va_blocks.
945 //
946 // va_block_context must not be NULL and va_block_context->policy must be valid.
947 // See the comments for uvm_va_block_check_policy_is_valid().
948 //
949 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
950 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
951 //          read mode.
952 NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
953                                        uvm_va_block_context_t *va_block_context,
954                                        uvm_processor_id_t processor_id);
955 
// Maps the given processor to all resident pages in this block and region, as
957 // allowed by location and policy. The caller is responsible for waiting for
958 // the tracker after all mappings have been started.
959 // This function can be called with HMM and managed va_blocks.
960 //
961 // va_block_context must not be NULL and va_block_context->policy must be valid.
962 // See the comments for uvm_va_block_check_policy_is_valid().
963 //
964 // LOCKING: The caller must hold the va_block lock and
965 //          va_block_context->mm->mmap_lock must be held in at least read mode.
966 NV_STATUS uvm_va_block_set_accessed_by_locked(uvm_va_block_t *va_block,
967                                               uvm_va_block_context_t *va_block_context,
968                                               uvm_processor_id_t processor_id,
969                                               uvm_va_block_region_t region,
970                                               uvm_tracker_t *out_tracker);
971 
972 // Breaks SetAccessedBy and remote mappings
973 // This function should only be called with managed va_blocks.
974 //
975 // va_block_context must not be NULL and va_block_context->policy must be valid.
976 // See the comments for uvm_va_block_check_policy_is_valid().
977 //
978 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
979 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
980 //          read mode.
981 NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
982                                             uvm_va_block_context_t *va_block_context);
983 
984 // Restores SetAccessedBy mappings
985 // This function should only be called with managed va_blocks.
986 //
987 // va_block_context must not be NULL and va_block_context->policy must be valid.
988 // See the comments for uvm_va_block_check_policy_is_valid().
989 //
990 // LOCKING: This takes and releases the VA block lock. If va_block_context->mm
991 //          != NULL, va_block_context->mm->mmap_lock must be held in at least
992 //          read mode.
993 NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
994                                               uvm_va_block_context_t *va_block_context);
995 
996 // Check if processor_id is allowed to access the va_block with access_type
997 // permissions. Return values:
998 //
999 // NV_ERR_INVALID_ADDRESS       The VA block is logically dead (zombie)
1000 // NV_ERR_INVALID_ACCESS_TYPE   The vma corresponding to the VA range does not
1001 //                              allow access_type permissions, or migration is
1002 //                              disallowed and processor_id cannot access the
1003 //                              range remotely (UVM-Lite).
1004 // NV_ERR_INVALID_OPERATION     The access would violate the policies specified
1005 //                              by UvmPreventMigrationRangeGroups.
1006 //
1007 // va_block_context must not be NULL, va_block_context->policy must be valid,
1008 // and if the va_block is a HMM block, va_block_context->hmm.vma must be valid
1009 // which also means the va_block_context->mm is not NULL, retained, and locked
1010 // for at least read.
1011 // Locking: the va_block lock must be held.
1012 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
1013                                                  uvm_va_block_context_t *va_block_context,
1014                                                  uvm_processor_id_t processor_id,
1015                                                  uvm_page_index_t page_index,
1016                                                  uvm_fault_type_t access_type,
1017                                                  bool allow_migration);
1018 
1019 // API for access privilege revocation
1020 //
1021 // Revoke prot_to_revoke access permissions for the given processor.
1022 //
1023 // The revoked pages are described by the region parameter and the revoke page
1024 // mask that allows the caller to restrict the revoke operation to specific
1025 // pages within the region.
1026 //
1027 // prot_to_revoke must be greater than UVM_PROT_READ_ONLY. Caller should call
1028 // unmap explicitly if it wants to revoke all access privileges.
1029 //
1030 // If id is UVM_ID_CPU, and prot_to_revoke is UVM_PROT_READ_WRITE_ATOMIC, no
1031 // action is performed. If the processor id corresponds to the CPU and the
1032 // caller cannot establish CPU mappings because it does not have a reference on
1033 // vma->vm_mm (va_block_context->mm != vma->vm_mm), the page will be simply
1034 // unmapped. Caller should call unmap explicitly if it wants to revoke all
1035 // access privileges.
1036 //
1037 // Any CPU revocation will wait for the va_block tracker. If this function
1038 // pushes GPU work it will first acquire the va_block tracker, then add the
1039 // pushed work to out_tracker. It is the caller's responsibility to add this
1040 // work to va_block's tracker. Note that while it is generally safe to run
1041 // revocation operations on different GPUs concurrently, two PTE operations
1042 // (map, unmap, revoke) on the same GPU must be serialized even if they target
1043 // different pages because the earlier operation can cause a PTE split or merge
1044 // which is assumed by the later operation.
1045 //
1046 // va_block_context must not be NULL. The va_block_context->policy is unused.
1047 //
1048 // If allocation-retry was required as part of the operation and was successful,
1049 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
1050 // out_tracker were added to the block's tracker and then the block's lock was
1051 // unlocked and relocked.
1052 //
1053 // In general, any status other than NV_OK indicates that the block's lock might
1054 // have been unlocked and relocked.
1055 //
1056 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
1057 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
1058 //          mode.
1059 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
1060                                    uvm_va_block_context_t *va_block_context,
1061                                    uvm_processor_id_t id,
1062                                    uvm_va_block_region_t region,
1063                                    const uvm_page_mask_t *revoke_page_mask,
1064                                    uvm_prot_t prot_to_revoke,
1065                                    uvm_tracker_t *out_tracker);
1066 
1067 // Like uvm_va_block_revoke_prot(), except it revokes all processors in the
1068 // input mask. The VA block tracker contains all revocation operations on
1069 // return.
1070 //
1071 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
1072 // uvm_va_block_revoke_prot() indicating that the operation needs to be retried.
1073 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
1074                                         uvm_va_block_context_t *va_block_context,
1075                                         const uvm_processor_mask_t *revoke_processor_mask,
1076                                         uvm_va_block_region_t region,
1077                                         const uvm_page_mask_t *revoke_page_mask,
1078                                         uvm_prot_t prot_to_revoke);
1079 
1080 // Tries to map all pages in the given region and map_page_mask with at most
1081 // max_prot privileges for appropriate processors as determined by the
1082 // accessed_by mask, heuristics and the given processor mask (excluding
1083 // processor_id, which triggered the migration and should have already been
1084 // mapped).
1085 //
1086 // va_block_context must not be NULL and va_block_context->policy must be valid.
1087 // See the comments for uvm_va_block_check_policy_is_valid().
1088 //
1089 // This function acquires/waits for the va_block tracker and updates that
1090 // tracker with any new work pushed.
1091 //
1092 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
1093 // uvm_va_block_map() indicating that the operation needs to be retried.
1094 //
1095 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
1096 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
1097 //          mode.
1098 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
1099                                                     uvm_va_block_context_t *va_block_context,
1100                                                     uvm_processor_id_t new_residency,
1101                                                     uvm_processor_id_t processor_id,
1102                                                     uvm_va_block_region_t region,
1103                                                     const uvm_page_mask_t *map_page_mask,
1104                                                     uvm_prot_t max_prot,
1105                                                     const uvm_processor_mask_t *processor_mask);
1106 
// Maps processors using SetAccessedBy to all resident pages in the region
// parameter. On Volta+ it is also used to map evicted pages that can later be
// pulled back by using access counters.
1110 //
1111 // This function acquires/waits for the va_block tracker and updates that
1112 // tracker with any new work pushed.
1113 //
1114 // Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
1115 // uvm_va_block_map() indicating that the operation needs to be retried.
1116 //
1117 // va_block_context must not be NULL and va_block_context->policy must be valid.
1118 // See the comments for uvm_va_block_check_policy_is_valid().
1119 //
1120 // LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
1121 //          NULL, va_block_context->mm->mmap_lock must be held in at least read
1122 //          mode.
1123 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
1124                                     uvm_va_block_context_t *va_block_context,
1125                                     uvm_processor_id_t processor_id,
1126                                     uvm_va_block_region_t region,
1127                                     const uvm_page_mask_t *page_mask,
1128                                     UvmEventMapRemoteCause cause);
1129 
1130 // Notifies the VA block that a new GPU VA space has been created.
1131 // LOCKING: The caller must hold the va_block lock
1132 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space);
1133 
1134 // Destroys the VA block's mappings and page tables on the GPU, if it has any.
1135 //
1136 // If mm != NULL, that mm is used for any CPU mappings which may be created as
1137 // a result of this call. See uvm_va_block_context_t::mm for details.
1138 //
1139 // va_block_context must not be NULL. The va_block_context->policy is unused.
1140 //
1141 // LOCKING: The caller must hold the va_block lock. If block_context->mm is not
1142 // NULL, the caller must hold mm->mmap_lock in at least read mode.
1143 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
1144                                       uvm_gpu_va_space_t *gpu_va_space,
1145                                       uvm_va_block_context_t *block_context);
1146 
1147 // Creates any mappings necessary in this VA block between the two GPUs, in
1148 // either direction.
1149 // LOCKING: The caller must hold the va_block lock
1150 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1151 
1152 // Unmaps all page tables in this VA block which have peer mappings between
1153 // the two GPUs, in either direction.
1154 // LOCKING: The caller must hold the va_block lock
1155 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
1156 
1157 // Unmap any mappings from GPU to the preferred location.
1158 //
1159 // The GPU has to be in UVM-Lite mode.
1160 //
1161 // LOCKING: The caller must hold the va_block lock
1162 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
1163 
1164 // Frees all memory under this block associated with this GPU. Any portion of
1165 // the block which is resident on the GPU is evicted to sysmem before being
1166 // freed.
1167 //
1168 // If mm != NULL, that mm is used for any CPU mappings which may be created as
1169 // a result of this call. See uvm_va_block_context_t::mm for details.
1170 //
1171 // LOCKING: This takes and releases the VA block lock. If mm != NULL, the caller
1172 //          must hold mm->mmap_lock in at least read mode.
1173 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
1174 
1175 // Same as uvm_va_block_unregister_gpu() but the VA block lock must be held.
// Note that this handles allocation-retry internally and hence might unlock
// and relock the block's lock.
1178 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
1179 
1180 // Unmaps all memory associated with the block and drops the ref count of the
1181 // block. This allows the caller to free resources associated with this block
1182 // regardless of the block's current ref count. Most importantly it allows the
1183 // VA covered by this block to be immediately available for other page table
1184 // mappings upon return.
1185 //
1186 // This clears block->va_range, so only the VA range destroy path should call
1187 // it. Other paths with references on this block, specifically the eviction path
1188 // which temporarily takes a reference to the block, must always check the block
1189 // state after taking the block lock to see if their mapping is still in place.
1190 //
1191 // All of the unmap and state destruction steps are also performed when the ref
1192 // count goes to 0, so this function only needs to be called if the block's
1193 // resources need to be reclaimed immediately.
1194 //
1195 // The caller should not lock the block before calling this function.
1196 //
1197 // This performs a uvm_va_block_release.
1198 void uvm_va_block_kill(uvm_va_block_t *va_block);
1199 
1200 // Exactly the same split semantics as uvm_va_range_split, including error
1201 // handling. See that function's comments for details.
1202 //
1203 // new_va_block's va_range is set to new_va_range before any reverse mapping is
1204 // established to the new block, but the caller is responsible for inserting the
1205 // new block into the range.
1206 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
1207                              NvU64 new_end,
1208                              uvm_va_block_t **new_va_block,
1209                              uvm_va_range_t *new_va_range);
1210 
// Exactly the same split semantics as uvm_va_block_split, including error
// handling, except that the existing_va_block lock must be held and
// new_va_block must be preallocated.
// Also note that the existing_va_block lock may be dropped and re-acquired.
1215 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
1216                                     NvU64 new_end,
1217                                     uvm_va_block_t *new_va_block,
1218                                     uvm_va_range_t *new_va_range);
1219 
1220 // Handles a CPU fault in the given VA block, performing any operations
1221 // necessary to establish a coherent CPU mapping (migrations, cache invalidates,
1222 // etc.).
1223 //
1224 // Locking:
//  - vma->vm_mm->mmap_lock must be held in at least read mode. Note that this
//    might not be the same as current->mm->mmap_lock.
1227 //  - va_space lock must be held in at least read mode
1228 //
1229 // service_context->block_context.mm is ignored and vma->vm_mm is used instead.
1230 // service_context->block_context.policy is set by this function.
1231 //
1232 // Returns NV_ERR_INVALID_ACCESS_TYPE if a CPU mapping to fault_addr cannot be
1233 // accessed, for example because it's within a range group which is non-
1234 // migratable.
1235 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
1236                                  NvU64 fault_addr,
1237                                  bool is_write,
1238                                  uvm_service_block_context_t *service_context);
1239 
1240 // Performs any operations necessary to establish a coherent mapping
1241 // (migrations, cache invalidates, etc.) in response to the given service block
1242 // context.
1243 //
1244 // service_context must not be NULL and service_context->block_context.policy
1245 // must be valid. See the comments for uvm_va_block_check_policy_is_valid().
1246 // If va_block is a HMM block, va_block_context->hmm.vma must be valid.
1247 // See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1248 // service_context->prefetch_hint is set by this function.
1249 //
1250 // Locking:
1251 //  - service_context->block_context.mm->mmap_lock must be held in at least
1252 //    read mode, if valid.
1253 //  - va_space lock must be held in at least read mode
1254 //  - va_block lock must be held
1255 //
1256 // If allocation-retry was required as part of the operation and was successful,
1257 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
1258 // was unlocked and relocked.
1259 //
1260 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
1261 // and the performance heuristics logic decided to throttle execution.
1262 // Any other error code different than NV_OK indicates OOM or a global fatal
1263 // error.
1264 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
1265                                       uvm_va_block_t *va_block,
1266                                       uvm_va_block_retry_t *block_retry,
1267                                       uvm_service_block_context_t *service_context);
1268 
// Populates the destination pages, and unmaps and copies the source pages to
// new_residency.
1271 //
1272 // service_context must not be NULL and service_context->block_context.policy
1273 // must be valid. See the comments for uvm_va_block_check_policy_is_valid().
1274 // If va_block is a HMM block, va_block_context->hmm.vma must be valid.
1275 // See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1276 // service_context->prefetch_hint should be set before calling this function.
1277 //
1278 // Locking:
1279 //  - service_context->block_context.mm->mmap_lock must be held in at least
1280 //    read mode, if valid.
1281 //  - va_space lock must be held in at least read mode
1282 //  - va_block lock must be held
1283 //
1284 // If allocation-retry was required as part of the operation and was successful,
1285 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
1286 // was unlocked and relocked.
1287 //
1288 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
1289 // and the performance heuristics logic decided to throttle execution.
1290 // Any other error code different than NV_OK indicates OOM or a global fatal
1291 // error.
1292 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
1293                                     uvm_processor_id_t new_residency,
1294                                     uvm_va_block_t *va_block,
1295                                     uvm_va_block_retry_t *block_retry,
1296                                     uvm_service_block_context_t *service_context);
1297 
1298 // This updates the va_block residency state and maps the faulting processor_id
1299 // to the new residency (which may be remote).
1300 //
1301 // service_context must not be NULL and service_context->block_context.policy
1302 // must be valid. See the comments for uvm_va_block_check_policy_is_valid().
1303 // If va_block is a HMM block, va_block_context->hmm.vma must be valid.
1304 // See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1305 // service_context must be initialized by calling uvm_va_block_service_copy()
1306 // before calling this function.
1307 //
1308 // Locking:
1309 //  - service_context->block_context.mm->mmap_lock must be held in at least
1310 //    read mode, if valid.
1311 //  - va_space lock must be held in at least read mode
1312 //  - va_block lock must be held
1313 //  - the mmap lock and va_space lock must be held across the calls to
1314 //    uvm_va_block_service_copy() and this function. If the va_block lock is
//    dropped in between, special care is needed to check for eviction and
1316 //    invalidation callbacks.
1317 //
1318 // If allocation-retry was required as part of the operation and was successful,
1319 // NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
1320 // was unlocked and relocked.
1321 //
1322 // NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
1323 // and the performance heuristics logic decided to throttle execution.
1324 // Any other error code different than NV_OK indicates OOM or a global fatal
1325 // error.
1326 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
1327                                       uvm_va_block_t *va_block,
1328                                       uvm_service_block_context_t *service_context);
1329 
1330 // Allocate GPU state for the given va_block and registered GPUs.
1331 // Locking: The block lock must be held.
1332 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block);
1333 
1334 // Release any GPU or policy data associated with the given region in response
1335 // to munmap().
1336 // Locking: The va_block lock must be held.
1337 void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
1338                                 uvm_va_block_region_t region);
1339 
1340 // Size of the block in bytes. Guaranteed to be a page-aligned value between
1341 // PAGE_SIZE and UVM_VA_BLOCK_SIZE.
1342 static inline NvU64 uvm_va_block_size(uvm_va_block_t *block)
1343 {
1344     NvU64 size = block->end - block->start + 1;
1345     UVM_ASSERT(PAGE_ALIGNED(size));
1346     UVM_ASSERT(size >= PAGE_SIZE);
1347     UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
1348     return size;
1349 }
1350 
// Number of PAGE_SIZE pages in the block
1352 static inline size_t uvm_va_block_num_cpu_pages(uvm_va_block_t *block)
1353 {
1354     return uvm_va_block_size(block) / PAGE_SIZE;
1355 }
1356 
1357 // VA of the given page using CPU page size. page_index must be valid
1358 static inline NvU64 uvm_va_block_cpu_page_address(uvm_va_block_t *block, uvm_page_index_t page_index)
1359 {
1360     UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(block));
1361     return block->start + PAGE_SIZE * page_index;
1362 }
1363 
// Get the physical address on the given GPU for the given residency
1365 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
1366                                                           uvm_page_index_t page_index,
1367                                                           uvm_processor_id_t residency,
1368                                                           uvm_gpu_t *gpu);
1369 
1370 // Get the page physical address on the given GPU
1371 //
1372 // This will assert that GPU state is indeed present.
1373 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
1374                                                           uvm_page_index_t page_index,
1375                                                           uvm_gpu_t *gpu);
1376 
1377 static bool uvm_va_block_contains_address(uvm_va_block_t *block, NvU64 address)
1378 {
1379     return address >= block->start && address <= block->end;
1380 }
1381 
1382 // Obtain a pointer to the uvm_va_block_test_t structure for the given VA
1383 // block. If uvm_enable_builtin_tests is unset, NULL will be returned.
1384 static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block)
1385 {
1386     if (uvm_enable_builtin_tests)
1387         return &container_of(va_block, uvm_va_block_wrapper_t, block)->test;
1388 
1389     return NULL;
1390 }
1391 
1392 // Get the page residency mask for a processor if it's known to be there.
1393 //
1394 // If the processor is a GPU, this will assert that GPU state is indeed present.
1395 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
1396 
1397 // Get the page mapped mask for a processor. The returned mask cannot be
1398 // directly modified by the caller
1399 //
1400 // If the processor is a GPU, this will assert that GPU state is indeed present.
1401 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
1402 
1403 // VA block lookup functions. There are a number of permutations which might be
1404 // useful, such as looking up the block from {va_space, va_range} x {addr,
1405 // block index}. The ones implemented here and in uvm_va_range.h support the
1406 // primary three use cases, which are:
1407 // 1) Iterating over all VA blocks in a VA range. This uses block indices on the
1408 //    VA range:
1409 //      uvm_va_range_num_blocks
1410 //      uvm_va_range_block_index
1411 //      uvm_va_range_block
1412 //      uvm_va_range_block_create
1413 // 2) Operating on a single VA block (fault). This looks up the block using the
1414 //    VA space and address:
1415 //      uvm_va_block_find
1416 //      uvm_va_block_find_create
1417 // 3) Operating on a single VA block (fault). This looks up the block using the
1418 //    supplied VA range and address:
1419 //      uvm_va_block_find_create_in_range
1420 
1421 // Finds the UVM or HMM VA block containing addr, if any. The va_space->lock
1422 // must be held in at least read mode. Return values:
1423 // NV_ERR_INVALID_ADDRESS   addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
1424 //                          a HMM enabled VMA.
1425 //
1426 // NV_ERR_OBJECT_NOT_FOUND  addr is valid but no block has been allocated to
1427 //                          cover it yet
1428 //
1429 // NV_OK                    The block was returned successfully
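//
// A minimal usage sketch (not the only valid pattern), assuming va_space and
// addr are caller-provided and va_space->lock is already held in at least
// read mode:
//
//     uvm_va_block_t *block;
//     NV_STATUS status = uvm_va_block_find(va_space, addr, &block);
//
//     if (status == NV_OK) {
//         // The block covering addr was found; take block->lock before
//         // inspecting or modifying its state.
//     }
//     else if (status == NV_ERR_OBJECT_NOT_FOUND) {
//         // addr is valid but no block has been allocated for it yet.
//     }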
1430 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block);
1431 
1432 // Same as uvm_va_block_find except that the block is created if not found.
1433 // If addr is covered by a UVM_VA_RANGE_TYPE_MANAGED va_range, a managed block
1434 // will be created. Otherwise, if addr is not covered by any va_range, HMM is
1435 // enabled in the va_space, and va_block_context and va_block_context->mm are
1436 // non-NULL, then a HMM block will be created and va_block_context->hmm.vma is
1437 // set to the VMA covering 'addr'. The va_block_context->policy field is left
1438 // unchanged.
1439 // In either case, if va_block_context->mm is non-NULL, it must be retained and
1440 // locked in at least read mode. Return values:
1441 // NV_ERR_INVALID_ADDRESS   addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
1442 //                          a HMM enabled VMA.
1443 // NV_ERR_NO_MEMORY         memory could not be allocated.
1444 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
1445                                    NvU64 addr,
1446                                    uvm_va_block_context_t *va_block_context,
1447                                    uvm_va_block_t **out_block);
1448 
// Same as uvm_va_block_find_create except that the va_range lookup was already
// done by the caller. If the supplied va_range is NULL, this function behaves
// just as uvm_va_block_find_create does when its va_range lookup finds no
// range.
1452 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
1453                                             uvm_va_range_t *va_range,
1454                                             NvU64 addr,
1455                                             uvm_va_block_context_t *va_block_context,
1456                                             uvm_va_block_t **out_block);
1457 
1458 // Same as uvm_va_block_find_create except that only managed va_blocks are
1459 // created if not already present in the VA range.
1460 static NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
1461                                                   NvU64 addr,
1462                                                   uvm_va_block_t **out_block)
1463 {
1464     return uvm_va_block_find_create(va_space, addr, NULL, out_block);
1465 }
1466 
1467 // Look up a chunk backing a specific address within the VA block.
1468 // Returns NULL if none.
1469 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address);
1470 
1471 // Implementation of the UvmMigrate() API at the VA block scope.
1472 //
1473 // The out_tracker can be NULL.
1474 //
1475 // If do_mappings is false, mappings are not added after pages have been
1476 // migrated.
1477 //
1478 // The caller needs to handle allocation-retry. va_block_retry can be NULL if
1479 // the destination is the CPU.
1480 //
1481 // va_block_context must not be NULL and va_block_context->policy must be valid.
1482 // See the comments for uvm_va_block_check_policy_is_valid().
1483 // If va_block is a HMM block, va_block_context->hmm.vma must be valid.
1484 // See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
1485 //
1486 // LOCKING: The caller must hold the va_block lock. If va_block_context->mm !=
1487 //          NULL, va_block_context->mm->mmap_lock must be held in at least
1488 //          read mode.
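//
// A minimal allocation-retry sketch, assuming dest_id, mode and a valid
// va_block_context are caller-provided and the locking requirements above are
// already satisfied. UVM_VA_BLOCK_RETRY_LOCKED (defined at the end of this
// header) re-issues the call while it returns NV_ERR_MORE_PROCESSING_REQUIRED:
//
//     uvm_va_block_retry_t va_block_retry;
//     NV_STATUS status;
//
//     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
//                  uvm_va_block_migrate_locked(va_block,
//                                              &va_block_retry,
//                                              va_block_context,
//                                              uvm_va_block_region_from_block(va_block),
//                                              dest_id,
//                                              mode,
//                                              NULL));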
1489 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
1490                                       uvm_va_block_retry_t *va_block_retry,
1491                                       uvm_va_block_context_t *va_block_context,
1492                                       uvm_va_block_region_t region,
1493                                       uvm_processor_id_t dest_id,
1494                                       uvm_migrate_mode_t mode,
1495                                       uvm_tracker_t *out_tracker);
1496 
1497 // Write block's data from a CPU buffer
1498 //
1499 // The [dst, dst + size) range has to fit within a single PAGE_SIZE page.
1500 //
1501 // va_block_context must not be NULL. The caller is not required to set
1502 // va_block_context->policy or va_block_context->hmm.vma.
1503 //
1504 // The caller needs to support allocation-retry of page tables.
1505 //
1506 // LOCKING: The caller must hold the va_block lock
1507 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
1508                                       uvm_va_block_context_t *block_context,
1509                                       NvU64 dst,
1510                                       uvm_mem_t *src,
1511                                       size_t size);
1512 
1513 // Read block's data into a CPU buffer
1514 //
1515 // The [src, src + size) range has to fit within a single PAGE_SIZE page.
1516 //
1517 // LOCKING: The caller must hold the va_block lock
1518 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst, NvU64 src, size_t size);
1519 
1520 // Initialize va block retry tracking
1521 void uvm_va_block_retry_init(uvm_va_block_retry_t *uvm_va_block_retry);
1522 
1523 // Deinitialize va block retry tracking after a block operation
1524 //
1525 // Frees all the remaining free chunks and unpins all the used chunks.
1526 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *uvm_va_block_retry, uvm_va_block_t *va_block);
1527 
1528 // Evict all chunks from the block that are subchunks of the passed in root_chunk.
1529 //
1530 // Add all the work tracking the eviction to the tracker.
1531 //
1532 // Returns NV_OK if the block is dead or doesn't have any subchunks of the
1533 // root_chunk.
1534 //
1535 // LOCKING: The caller must hold the va_block lock
1536 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
1537                                     uvm_gpu_t *gpu,
1538                                     uvm_gpu_chunk_t *root_chunk,
1539                                     uvm_tracker_t *tracker);
1540 
1541 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp);
1542 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp);
1543 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp);
1544 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp);
1545 
1546 // Compute the offset in system pages of addr from the start of va_block.
1547 static uvm_page_index_t uvm_va_block_cpu_page_index(uvm_va_block_t *va_block, NvU64 addr)
1548 {
1549     UVM_ASSERT(addr >= va_block->start);
1550     UVM_ASSERT(addr <= va_block->end);
1551     return (addr - va_block->start) / PAGE_SIZE;
1552 }
1553 
1554 // Computes the size and index in the gpu_state chunks array of the GPU chunk
1555 // which corresponds to the given page_index of the VA region.
// Note that this is only used for testing and does not work on HMM va_blocks,
// for which it returns incorrect results.
1558 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
1559                                           NvU64 size,
1560                                           uvm_gpu_t *gpu,
1561                                           uvm_page_index_t page_index,
1562                                           uvm_chunk_size_t *out_chunk_size);
1563 
1564 // If there are any resident CPU pages in the block, mark them as dirty
1565 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block);
1566 
1567 // Sets the internal state required to handle fault cancellation
1568 //
1569 // This function may require allocating page tables to split big pages into 4K
1570 // pages. If allocation-retry was required as part of the operation and was
1571 // successful, NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case the
1572 // block's lock was unlocked and relocked.
1573 //
1574 // va_block_context must not be NULL. The va_block_context->policy is unused.
1575 //
1576 // LOCKING: The caller must hold the va_block lock.
1577 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu);
1578 
1579 //
1580 // uvm_va_block_region_t helpers
1581 //
1582 
1583 static uvm_va_block_region_t uvm_va_block_region(uvm_page_index_t first, uvm_page_index_t outer)
1584 {
1585     BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= (1 << (sizeof(first) * 8)));
1586 
1587     UVM_ASSERT(first <= outer);
1588 
1589     return (uvm_va_block_region_t){ .first = first, .outer = outer };
1590 }
1591 
1592 static uvm_va_block_region_t uvm_va_block_region_for_page(uvm_page_index_t page_index)
1593 {
1594     return uvm_va_block_region(page_index, page_index + 1);
1595 }
1596 
1597 static size_t uvm_va_block_region_num_pages(uvm_va_block_region_t region)
1598 {
1599     return region.outer - region.first;
1600 }
1601 
1602 static NvU64 uvm_va_block_region_size(uvm_va_block_region_t region)
1603 {
1604     return uvm_va_block_region_num_pages(region) * PAGE_SIZE;
1605 }
1606 
1607 static NvU64 uvm_va_block_region_start(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1608 {
1609     return va_block->start + region.first * PAGE_SIZE;
1610 }
1611 
1612 static NvU64 uvm_va_block_region_end(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1613 {
1614     return va_block->start + region.outer * PAGE_SIZE - 1;
1615 }
1616 
1617 static bool uvm_va_block_region_contains_region(uvm_va_block_region_t region, uvm_va_block_region_t subregion)
1618 {
1619     return subregion.first >= region.first && subregion.outer <= region.outer;
1620 }
1621 
1622 static bool uvm_va_block_region_contains_page(uvm_va_block_region_t region, uvm_page_index_t page_index)
1623 {
1624     return uvm_va_block_region_contains_region(region, uvm_va_block_region_for_page(page_index));
1625 }
1626 
1627 // Create a block range from a va block and start and end virtual addresses
1628 // within the block.
1629 static uvm_va_block_region_t uvm_va_block_region_from_start_end(uvm_va_block_t *va_block, NvU64 start, NvU64 end)
1630 {
1631     uvm_va_block_region_t region;
1632 
1633     UVM_ASSERT(start < end);
1634     UVM_ASSERT(start >= va_block->start);
1635     UVM_ASSERT(end <= va_block->end);
1636     UVM_ASSERT(PAGE_ALIGNED(start));
1637     UVM_ASSERT(PAGE_ALIGNED(end + 1));
1638 
1639     region.first = uvm_va_block_cpu_page_index(va_block, start);
1640     region.outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
1641 
1642     return region;
1643 }
1644 
1645 static uvm_va_block_region_t uvm_va_block_region_from_start_size(uvm_va_block_t *va_block, NvU64 start, NvU64 size)
1646 {
1647     return uvm_va_block_region_from_start_end(va_block, start, start + size - 1);
1648 }
1649 
1650 static uvm_va_block_region_t uvm_va_block_region_from_block(uvm_va_block_t *va_block)
1651 {
1652     return uvm_va_block_region(0, uvm_va_block_num_cpu_pages(va_block));
1653 }
1654 
1655 // Create a block region from a va block and page mask. Note that the region
1656 // covers the first through the last set bit and may have unset bits in between.
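// For example, a mask with only pages 3 and 10 set yields the region [3, 11),
// which also spans the unset pages 4-9.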
1657 static uvm_va_block_region_t uvm_va_block_region_from_mask(uvm_va_block_t *va_block, const uvm_page_mask_t *page_mask)
1658 {
1659     uvm_va_block_region_t region;
1660     uvm_page_index_t outer = uvm_va_block_num_cpu_pages(va_block);
1661 
1662     region.first = find_first_bit(page_mask->bitmap, outer);
1663     if (region.first >= outer) {
1664         region = uvm_va_block_region(0, 0);
1665     }
1666     else {
1667         // At least one bit is set so find_last_bit() should not return 'outer'.
1668         region.outer = find_last_bit(page_mask->bitmap, outer) + 1;
1669         UVM_ASSERT(region.outer <= outer);
1670     }
1671 
1672     return region;
1673 }
1674 
1675 static bool uvm_page_mask_test(const uvm_page_mask_t *mask, uvm_page_index_t page_index)
1676 {
1677     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1678 
1679     return test_bit(page_index, mask->bitmap);
1680 }
1681 
1682 static bool uvm_page_mask_test_and_set(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1683 {
1684     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1685 
1686     return __test_and_set_bit(page_index, mask->bitmap);
1687 }
1688 
1689 static bool uvm_page_mask_test_and_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1690 {
1691     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1692 
1693     return __test_and_clear_bit(page_index, mask->bitmap);
1694 }
1695 
1696 static void uvm_page_mask_set(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1697 {
1698     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1699 
1700     __set_bit(page_index, mask->bitmap);
1701 }
1702 
1703 static void uvm_page_mask_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index)
1704 {
1705     UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
1706 
1707     __clear_bit(page_index, mask->bitmap);
1708 }
1709 
1710 static bool uvm_page_mask_region_test(const uvm_page_mask_t *mask,
1711                                       uvm_va_block_region_t region,
1712                                       uvm_page_index_t page_index)
1713 {
1714     if (!uvm_va_block_region_contains_page(region, page_index))
1715         return false;
1716 
1717     return !mask || uvm_page_mask_test(mask, page_index);
1718 }
1719 
1720 static NvU32 uvm_page_mask_region_weight(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
1721 {
1722     NvU32 weight_before = 0;
1723 
1724     if (region.first > 0)
1725         weight_before = bitmap_weight(mask->bitmap, region.first);
1726 
1727     return bitmap_weight(mask->bitmap, region.outer) - weight_before;
1728 }
1729 
1730 static bool uvm_page_mask_region_empty(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
1731 {
1732     return find_next_bit(mask->bitmap, region.outer, region.first) == region.outer;
1733 }
1734 
1735 static bool uvm_page_mask_region_full(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
1736 {
1737     return find_next_zero_bit(mask->bitmap, region.outer, region.first) == region.outer;
1738 }
1739 
1740 static void uvm_page_mask_region_fill(uvm_page_mask_t *mask, uvm_va_block_region_t region)
1741 {
1742     bitmap_set(mask->bitmap, region.first, region.outer - region.first);
1743 }
1744 
1745 static void uvm_page_mask_region_clear(uvm_page_mask_t *mask, uvm_va_block_region_t region)
1746 {
1747     bitmap_clear(mask->bitmap, region.first, region.outer - region.first);
1748 }
1749 
1750 static void uvm_page_mask_region_clear_outside(uvm_page_mask_t *mask, uvm_va_block_region_t region)
1751 {
1752     if (region.first > 0)
1753         bitmap_clear(mask->bitmap, 0, region.first);
1754     if (region.outer < PAGES_PER_UVM_VA_BLOCK)
1755         bitmap_clear(mask->bitmap, region.outer, PAGES_PER_UVM_VA_BLOCK - region.outer);
1756 }
1757 
1758 static void uvm_page_mask_zero(uvm_page_mask_t *mask)
1759 {
1760     bitmap_zero(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1761 }
1762 
1763 static bool uvm_page_mask_empty(const uvm_page_mask_t *mask)
1764 {
1765     return bitmap_empty(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1766 }
1767 
1768 static bool uvm_page_mask_full(const uvm_page_mask_t *mask)
1769 {
1770     return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1771 }
1772 
1773 static bool uvm_page_mask_and(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
1774 {
1775     return bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1776 }
1777 
1778 static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
1779 {
1780     return bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1781 }
1782 
1783 static void uvm_page_mask_or(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
1784 {
1785     bitmap_or(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1786 }
1787 
1788 static void uvm_page_mask_complement(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in)
1789 {
1790     bitmap_complement(mask_out->bitmap, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK);
1791 }
1792 
1793 static void uvm_page_mask_copy(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in)
1794 {
1795     bitmap_copy(mask_out->bitmap, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK);
1796 }
1797 
1798 static NvU32 uvm_page_mask_weight(const uvm_page_mask_t *mask)
1799 {
1800     return bitmap_weight(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1801 }
1802 
1803 static bool uvm_page_mask_subset(const uvm_page_mask_t *subset, const uvm_page_mask_t *mask)
1804 {
1805     return bitmap_subset(subset->bitmap, mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
1806 }
1807 
1808 static bool uvm_page_mask_init_from_region(uvm_page_mask_t *mask_out,
1809                                            uvm_va_block_region_t region,
1810                                            const uvm_page_mask_t *mask_in)
1811 {
1812     uvm_page_mask_zero(mask_out);
1813     uvm_page_mask_region_fill(mask_out, region);
1814 
1815     if (mask_in)
1816         return uvm_page_mask_and(mask_out, mask_out, mask_in);
1817 
1818     return true;
1819 }
1820 
1821 static void uvm_page_mask_shift_right(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift)
1822 {
1823     bitmap_shift_right(mask_out->bitmap, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK);
1824 }
1825 
1826 static void uvm_page_mask_shift_left(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift)
1827 {
1828     bitmap_shift_left(mask_out->bitmap, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK);
1829 }
1830 
1831 static bool uvm_page_mask_intersects(const uvm_page_mask_t *mask1, const uvm_page_mask_t *mask2)
1832 {
1833     return bitmap_intersects(mask1->bitmap, mask2->bitmap, PAGES_PER_UVM_VA_BLOCK);
1834 }
1835 
1836 // Print the given page mask on the given buffer using hex symbols. The
1837 // minimum required size of the buffer is UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE.
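//
// A minimal usage sketch, assuming 'mask' is a caller-provided page mask:
//
//     char buffer[UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE];
//
//     uvm_page_mask_print(mask, buffer);
//     // buffer now holds the mask as ':'-separated 64-bit hex words, or a
//     // single 32-bit hex word when PAGES_PER_UVM_VA_BLOCK == 32.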
1838 static void uvm_page_mask_print(const uvm_page_mask_t *mask, char *buffer)
1839 {
1840     // There are two cases, which depend on PAGE_SIZE
1841     if (PAGES_PER_UVM_VA_BLOCK > 32) {
1842         NvLength current_long_idx = UVM_PAGE_MASK_WORDS - 1;
1843         const char *buffer_end = buffer + UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE;
1844 
1845         UVM_ASSERT(sizeof(*mask->bitmap) == 8);
1846 
1847         // For 4KB pages, we need to iterate over multiple words
1848         do {
1849             NvU64 current_long = mask->bitmap[current_long_idx];
1850 
1851             buffer += sprintf(buffer, "%016llx", current_long);
1852             if (current_long_idx != 0)
1853                 buffer += sprintf(buffer, ":");
1854         } while (current_long_idx-- != 0);
1855 
1856         UVM_ASSERT(buffer <= buffer_end);
1857     }
1858     else {
1859         NvU32 value = (unsigned)*mask->bitmap;
1860 
1861         UVM_ASSERT(PAGES_PER_UVM_VA_BLOCK == 32);
1862 
1863         // For 64KB pages, a single print suffices
1864         sprintf(buffer, "%08x", value);
1865     }
1866 }
1867 
1868 static uvm_va_block_region_t uvm_va_block_first_subregion_in_mask(uvm_va_block_region_t region,
1869                                                                   const uvm_page_mask_t *page_mask)
1870 {
1871     uvm_va_block_region_t subregion;
1872 
1873     if (!page_mask)
1874         return region;
1875 
1876     subregion.first = find_next_bit(page_mask->bitmap, region.outer, region.first);
1877     subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1);
1878     return subregion;
1879 }
1880 
1881 static uvm_va_block_region_t uvm_va_block_next_subregion_in_mask(uvm_va_block_region_t region,
1882                                                                  const uvm_page_mask_t *page_mask,
1883                                                                  uvm_va_block_region_t previous_subregion)
1884 {
1885     uvm_va_block_region_t subregion;
1886 
1887     if (!page_mask) {
1888         subregion.first = region.outer;
1889         subregion.outer = region.outer;
1890         return subregion;
1891     }
1892 
1893     subregion.first = find_next_bit(page_mask->bitmap, region.outer, previous_subregion.outer + 1);
1894     subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1);
1895     return subregion;
1896 }
1897 
1898 // Iterate over contiguous subregions of the region given by the page mask.
1899 // If the page mask is NULL then it behaves as if it was a fully set mask and
1900 // the only subregion iterated over will be the region itself.
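//
// A minimal usage sketch, assuming 'region' and an optional 'page_mask' are
// caller-provided:
//
//     uvm_va_block_region_t subregion;
//
//     for_each_va_block_subregion_in_mask(subregion, page_mask, region) {
//         // Each subregion is a maximal run of consecutive set bits within
//         // region; process pages [subregion.first, subregion.outer) here.
//     }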
1901 #define for_each_va_block_subregion_in_mask(subregion, page_mask, region)                       \
1902     for ((subregion) = uvm_va_block_first_subregion_in_mask((region), (page_mask));             \
1903          (subregion).first != (region).outer;                                                   \
1904          (subregion) = uvm_va_block_next_subregion_in_mask((region), (page_mask), (subregion)))
1905 
1906 static uvm_page_index_t uvm_va_block_first_page_in_mask(uvm_va_block_region_t region,
1907                                                         const uvm_page_mask_t *page_mask)
1908 {
1909     if (page_mask)
1910         return find_next_bit(page_mask->bitmap, region.outer, region.first);
1911     else
1912         return region.first;
1913 }
1914 
1915 static uvm_page_index_t uvm_va_block_next_page_in_mask(uvm_va_block_region_t region,
1916                                                        const uvm_page_mask_t *page_mask,
1917                                                        uvm_page_index_t previous_page)
1918 {
1919     if (page_mask) {
1920         return find_next_bit(page_mask->bitmap, region.outer, previous_page + 1);
1921     }
1922     else {
1923         UVM_ASSERT(previous_page < region.outer);
1924         return previous_page + 1;
1925     }
1926 }
1927 
1928 static uvm_page_index_t uvm_va_block_first_unset_page_in_mask(uvm_va_block_region_t region,
1929                                                               const uvm_page_mask_t *page_mask)
1930 {
1931     if (page_mask)
1932         return find_next_zero_bit(page_mask->bitmap, region.outer, region.first);
1933     else
1934         return region.first;
1935 }
1936 
1937 static uvm_page_index_t uvm_va_block_next_unset_page_in_mask(uvm_va_block_region_t region,
1938                                                              const uvm_page_mask_t *page_mask,
1939                                                              uvm_page_index_t previous_page)
1940 {
1941     if (page_mask) {
1942         return find_next_zero_bit(page_mask->bitmap, region.outer, previous_page + 1);
1943     }
1944     else {
1945         UVM_ASSERT(previous_page < region.outer);
1946         return previous_page + 1;
1947     }
1948 }
1949 
1950 static NvU64 uvm_reverse_map_start(const uvm_reverse_map_t *reverse_map)
1951 {
1952     return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first);
1953 }
1954 
1955 static NvU64 uvm_reverse_map_end(const uvm_reverse_map_t *reverse_map)
1956 {
1957     return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first) +
1958            uvm_va_block_region_size(reverse_map->region) - 1;
1959 }
1960 
// Iterate over the pages set in the page mask within the given region.
1962 // If the page mask is NULL then it behaves as if it was a fully set mask and
1963 // it will iterate over all pages within the region.
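//
// A minimal usage sketch, assuming 'region' and an optional 'page_mask' are
// caller-provided:
//
//     uvm_page_index_t page_index;
//
//     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
//         // page_index visits every set page of page_mask within region, or
//         // every page of region if page_mask is NULL.
//     }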
1964 #define for_each_va_block_page_in_region_mask(page_index, page_mask, region)                 \
1965     for ((page_index) = uvm_va_block_first_page_in_mask((region), (page_mask));              \
1966          (page_index) != (region).outer;                                                     \
1967          (page_index) = uvm_va_block_next_page_in_mask((region), (page_mask), (page_index)))
1968 
1969 // Same as for_each_va_block_page_in_region_mask, but the region spans the
1970 // whole given VA block
1971 #define for_each_va_block_page_in_mask(page_index, page_mask, va_block)                      \
1972     for_each_va_block_page_in_region_mask(page_index, page_mask, uvm_va_block_region_from_block(va_block))
1973 
1974 // Similar to for_each_va_block_page_in_region_mask, but iterating over pages
1975 // whose bit is unset.
1976 #define for_each_va_block_unset_page_in_region_mask(page_index, page_mask, region)           \
1977     for ((page_index) = uvm_va_block_first_unset_page_in_mask((region), (page_mask));        \
1978          (page_index) != (region).outer;                                                     \
1979          (page_index) = uvm_va_block_next_unset_page_in_mask((region), (page_mask), (page_index)))
1980 
1981 // Similar to for_each_va_block_page_in_mask, but iterating over pages whose
1982 // bit is unset.
1983 #define for_each_va_block_unset_page_in_mask(page_index, page_mask, va_block)                \
1984     for_each_va_block_unset_page_in_region_mask(page_index, page_mask, uvm_va_block_region_from_block(va_block))
1985 
1986 // Iterate over all pages within the given region
1987 #define for_each_va_block_page_in_region(page_index, region)                                 \
1988     for_each_va_block_page_in_region_mask((page_index), NULL, (region))
1989 
1990 // Iterate over all pages within the given VA block
1991 #define for_each_va_block_page(page_index, va_block)                                         \
1992     for_each_va_block_page_in_region((page_index), uvm_va_block_region_from_block(va_block))
1993 
// Return the block region covered by the given chunk size. page_index may be
// any page within the block that is known to be covered by the chunk.
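// For example, assuming a 4K PAGE_SIZE, a 64K chunk size and a block start
// aligned to the chunk size, page_index 17 yields the region [16, 32).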
1996 static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block,
1997                                                        uvm_chunk_size_t chunk_size,
1998                                                        uvm_page_index_t page_index)
1999 {
2000     NvU64 page_addr = uvm_va_block_cpu_page_address(block, page_index);
2001     NvU64 chunk_start_addr = UVM_ALIGN_DOWN(page_addr, chunk_size);
2002     uvm_page_index_t first = (uvm_page_index_t)((chunk_start_addr - block->start) / PAGE_SIZE);
2003     return uvm_va_block_region(first, first + (chunk_size / PAGE_SIZE));
2004 }
2005 
2006 //
2007 // Helpers for page state (permissions, size, residency)
2008 //
2009 
2010 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
2011                                          uvm_page_index_t page_index,
2012                                          uvm_gpu_id_t gpu_id,
2013                                          uvm_prot_t required_prot);
2014 
2015 // Compute the processors that have a copy of the given page resident in their
2016 // memory.
2017 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2018                                            uvm_page_index_t page_index,
2019                                            uvm_processor_mask_t *resident_processors);
2020 
2021 // Count how many processors have a copy of the given page resident in their
2022 // memory.
2023 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index);
2024 
2025 // Get the processor with a resident copy of a page closest to the given
2026 // processor.
2027 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2028                                                           uvm_page_index_t page_index,
2029                                                           uvm_processor_id_t processor);
2030 
2031 // Insert a CPU chunk at the given page_index into the va_block.
2032 // Locking: The va_block lock must be held.
2033 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
2034                                         uvm_cpu_chunk_t *chunk,
2035                                         uvm_page_index_t page_index);
2036 
2037 // Remove a CPU chunk at the given page_index from the va_block.
2038 // Locking: The va_block lock must be held.
2039 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
2040                                      uvm_page_index_t page_index);
2041 
2042 // Return the CPU chunk at the given page_index from the va_block.
2043 // Locking: The va_block lock must be held.
2044 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
2045                                                   uvm_page_index_t page_index);
2046 
// Return the struct page backing the given page_index of the va_block.
2048 // Locking: The va_block lock must be held.
2049 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block,
2050                                         uvm_page_index_t page_index);
2051 
2052 // Physically map a CPU chunk so it is DMA'able from all registered GPUs.
2053 // Locking: The va_block lock must be held.
2054 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
2055                                              uvm_page_index_t page_index);
2056 
2057 // Physically unmap a CPU chunk from all registered GPUs.
2058 // Locking: The va_block lock must be held.
2059 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
2060                                           uvm_cpu_chunk_t *chunk,
2061                                           uvm_page_index_t page_index);
2062 
2063 // Remove any CPU chunks in the given region.
2064 // Locking: The va_block lock must be held.
2065 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);
2066 
2067 // Get CPU page size or 0 if it is not mapped
2068 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
2069                                  uvm_page_index_t page_index);
2070 
2071 // Get GPU page size or 0 if it is not mapped on the given GPU
2072 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
2073 
2074 // Get page size or 0 if it is not mapped on the given processor
2075 static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
2076                                               uvm_processor_id_t processor_id,
2077                                               uvm_page_index_t page_index)
2078 {
2079     if (UVM_ID_IS_CPU(processor_id))
2080         return uvm_va_block_page_size_cpu(va_block, page_index);
2081     else
2082         return uvm_va_block_page_size_gpu(va_block, processor_id, page_index);
2083 }
2084 
2085 // Returns the big page size for the GPU VA space of the block
2086 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
2087 
2088 // Returns the number of big pages in the VA block for the given size
2089 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);
2090 
2091 // Returns the number of big pages in the VA block for the big page size on the
2092 // given GPU
2093 static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
2094 {
2095     return uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu));
2096 }
2097 
2098 // Returns the start address of the given big page index and big page size
2099 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);
2100 
2101 // Returns the region [start, end] of the given big page index and big page size
2102 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
2103                                                    size_t big_page_index,
2104                                                    NvU32 big_page_size);
2105 
// Returns the largest sub-region of [start, end] which can fit big
2107 // pages. If the region cannot fit any big pages, an invalid region (0, 0) is
2108 // returned.
2109 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
2110 
// Returns the largest sub-region of 'region' which can fit big pages.
2112 // If the region cannot fit any big pages, an invalid region (0, 0) is returned.
2113 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
2114                                                           uvm_va_block_region_t region,
2115                                                           NvU32 big_page_size);
2116 
2117 // Returns the big page index (the bit index within
2118 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
2119 // page_index cannot be covered by a big PTE due to alignment or block size,
2120 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
2121 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);
2122 
2123 // Returns the new residency for a page that faulted or triggered access
2124 // counter notifications. The read_duplicate output parameter indicates if the
// page meets the requirements to be read-duplicated.
2126 // va_block_context must not be NULL, va_block_context->policy must be valid,
2127 // and if the va_block is a HMM block, va_block_context->hmm.vma must be valid
2128 // which also means the va_block_context->mm is not NULL, retained, and locked
2129 // for at least read. See the comments for uvm_va_block_check_policy_is_valid()
2130 // and uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
2131 // Locking: the va_block lock must be held.
2132 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
2133                                                  uvm_va_block_context_t *va_block_context,
2134                                                  uvm_page_index_t page_index,
2135                                                  uvm_processor_id_t processor_id,
2136                                                  NvU32 access_type_mask,
2137                                                  const uvm_va_policy_t *policy,
2138                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
2139                                                  uvm_service_operation_t operation,
2140                                                  bool *read_duplicate);
2141 
// Return the maximum mapping protection for processor_id that will not require
// any permission revocation on the rest of the processors.
2144 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
2145                                                         uvm_processor_id_t processor_id,
2146                                                         uvm_page_index_t page_index);
2147 
2148 // A helper macro for handling allocation-retry
2149 //
2150 // The macro takes a VA block, uvm_va_block_retry_t struct and a function call
2151 // to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED.
2152 //
2153 // block_retry can be NULL if it's not necessary for the function call,
2154 // otherwise it will be initialized and deinitialized by the macro.
2155 //
// The macro also locks and unlocks the block's lock internally, since the
// block's lock is expected to have been unlocked and relocked whenever the
// function call returns NV_ERR_MORE_PROCESSING_REQUIRED. This makes it clear
// that the block's state is not locked across these calls.
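//
// A minimal usage sketch, assuming 'va_block_context', 'gpu_id', 'region', a
// 'revoke_page_mask' and an initialized 'local_tracker' are caller-provided
// and the block lock is not yet held (the macro takes and releases it):
//
//     NV_STATUS status;
//
//     status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
//                  uvm_va_block_revoke_prot(va_block,
//                                           va_block_context,
//                                           gpu_id,
//                                           region,
//                                           revoke_page_mask,
//                                           UVM_PROT_READ_WRITE_ATOMIC,
//                                           &local_tracker));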
2160 #define UVM_VA_BLOCK_LOCK_RETRY(va_block, block_retry, call) ({     \
2161     NV_STATUS status;                                               \
2162     uvm_va_block_t *__block = (va_block);                           \
2163     uvm_va_block_retry_t *__retry = (block_retry);                  \
2164                                                                     \
2165     uvm_va_block_retry_init(__retry);                               \
2166                                                                     \
2167     uvm_mutex_lock(&__block->lock);                                 \
2168                                                                     \
2169     do {                                                            \
2170         status = (call);                                            \
2171     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);            \
2172                                                                     \
2173     uvm_mutex_unlock(&__block->lock);                               \
2174                                                                     \
2175     uvm_va_block_retry_deinit(__retry, __block);                    \
2176                                                                     \
2177     status;                                                         \
2178 })
2179 
2180 // A helper macro for handling allocation-retry
2181 //
2182 // The macro takes a VA block, uvm_va_block_retry_t struct and a function call
2183 // to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED.
2184 //
2185 // block_retry can be NULL if it's not necessary for the function call,
2186 // otherwise it will be initialized and deinitialized by the macro.
2187 //
2188 // This macro, as opposed to UVM_VA_BLOCK_LOCK_RETRY(), expects the block lock
2189 // to be already taken. Notably the block's lock might be unlocked and relocked
2190 // as part of the call.
2191 #define UVM_VA_BLOCK_RETRY_LOCKED(va_block, block_retry, call) ({   \
2192     NV_STATUS status;                                               \
2193     uvm_va_block_t *__block = (va_block);                           \
2194     uvm_va_block_retry_t *__retry = (block_retry);                  \
2195                                                                     \
2196     uvm_va_block_retry_init(__retry);                               \
2197                                                                     \
2198     uvm_assert_mutex_locked(&__block->lock);                        \
2199                                                                     \
2200     do {                                                            \
2201         status = (call);                                            \
2202     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);            \
2203                                                                     \
2204     uvm_va_block_retry_deinit(__retry, __block);                    \
2205                                                                     \
2206     status;                                                         \
2207 })
2208 
2209 #endif // __UVM_VA_BLOCK_H__
2210