/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_VA_BLOCK_TYPES_H__
#define __UVM_VA_BLOCK_TYPES_H__

#include "uvm_common.h"
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "uvm_forward_decl.h"

#include <linux/migrate.h>
#include <linux/nodemask.h>

// UVM_VA_BLOCK_BITS is 21, meaning the maximum block size is 2MB. Rationale:
// - 2MB matches the largest Pascal GPU page size, so it's a natural fit.
// - 2MB won't span more than one PDE on any chip, so the VA blocks never need
//   to track more than a single GPU PDE.
// - 2MB is a decent tradeoff between memory overhead and serialization
//   contention.
//
#define UVM_VA_BLOCK_BITS               21

// Max size of a block in bytes
#define UVM_VA_BLOCK_SIZE               (1ULL << UVM_VA_BLOCK_BITS)

#define UVM_VA_BLOCK_ALIGN_DOWN(addr)   UVM_ALIGN_DOWN(addr, UVM_VA_BLOCK_SIZE)
#define UVM_VA_BLOCK_ALIGN_UP(addr)     UVM_ALIGN_UP(addr, UVM_VA_BLOCK_SIZE)

#define PAGES_PER_UVM_VA_BLOCK          (UVM_VA_BLOCK_SIZE / PAGE_SIZE)
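
// Example (illustrative sketch, not a driver API): computing the enclosing
// block's start address and the page index of an address within that block
// using the macros above. The variable names are hypothetical.
//
//     NvU64 block_start = UVM_VA_BLOCK_ALIGN_DOWN(addr);
//     size_t page_index = (addr - block_start) / PAGE_SIZE;
//
// With a 4K PAGE_SIZE this gives PAGES_PER_UVM_VA_BLOCK == 512 pages per 2MB
// block.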

#define UVM_MIN_BIG_PAGE_SIZE           UVM_PAGE_SIZE_64K
#define MAX_BIG_PAGES_PER_UVM_VA_BLOCK  (UVM_VA_BLOCK_SIZE / UVM_MIN_BIG_PAGE_SIZE)

// Prefetch heuristics shift the VA Block page mask so that it is always
// aligned to big page granularity. Big page size is guaranteed not to exceed
// UVM_VA_BLOCK_SIZE, so the shifted mask uses at most
// 2 * PAGES_PER_UVM_VA_BLOCK pages. Note that uvm_page_index_t needs to be
// able to hold outer page indices (one beyond the last one), for example in
// uvm_va_block_region_t.
#if (2 * PAGES_PER_UVM_VA_BLOCK) <= NV_U8_MAX
    typedef NvU8 uvm_page_index_t;
#elif (2 * PAGES_PER_UVM_VA_BLOCK) <= NV_U16_MAX
    typedef NvU16 uvm_page_index_t;
#else
    #warning "Suspicious value for PAGES_PER_UVM_VA_BLOCK"
    typedef NvU32 uvm_page_index_t;
#endif
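
// Worked example (assuming a 4K PAGE_SIZE): a 2MB block holds 512 pages, so
// 2 * PAGES_PER_UVM_VA_BLOCK == 1024. That exceeds NV_U8_MAX (255) but fits
// within NV_U16_MAX (65535), so uvm_page_index_t is NvU16.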

// Encapsulates a [first, outer) region of pages within a va block
typedef struct
{
    // Page indices within the va block
    uvm_page_index_t first;
    uvm_page_index_t outer;
} uvm_va_block_region_t;
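
// Example (illustrative, not a driver API): a region covering the whole block.
// Because the interval is half-open, the number of pages in a region is simply
// outer - first.
//
//     uvm_va_block_region_t whole = { .first = 0, .outer = PAGES_PER_UVM_VA_BLOCK };
//     size_t num_pages = whole.outer - whole.first;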

typedef struct
{
    DECLARE_BITMAP(bitmap, PAGES_PER_UVM_VA_BLOCK);
} uvm_page_mask_t;
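
// uvm_page_mask_t wraps a standard Linux bitmap with one bit per block page,
// so the usual <linux/bitmap.h> helpers apply. Illustrative usage (the mask
// variable is hypothetical):
//
//     uvm_page_mask_t mask;
//
//     bitmap_zero(mask.bitmap, PAGES_PER_UVM_VA_BLOCK);
//     bitmap_set(mask.bitmap, 0, 4);          // mark pages [0, 4)
//     if (test_bit(3, mask.bitmap))
//         ...                                 // page 3 is set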

// When updating GPU PTEs, this struct describes the new arrangement of PTE
// sizes. It is calculated before the operation is applied so we know which PTE
// sizes to allocate.
//
// This only describes the new layout. The operation page mask describes the
// new permissions of each of these PTEs.
typedef struct
{
    // Whether the new PTE should remain 2m (if already 2m) or be merged to 2m.
    // The meaning is the same as uvm_va_block_gpu_state_t::pte_is_2m. If this
    // is set, the other fields can be ignored.
    bool pte_is_2m;

    // Whether the operation requires writing 4k PTEs and thus needs them
    // allocated. Mutually exclusive with pte_is_2m, but not with big_ptes.
    bool needs_4k;

    // These are the PTEs which will be big after the operation is done. This
    // field will become the new value of uvm_va_block_gpu_state_t::big_ptes, so
    // it contains both those big PTEs which are being modified by the
    // operation, and any pre-existing big PTEs which remain unchanged. The
    // latter will not have the corresponding bit set in big_ptes_covered.
    DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);

    // These are the big PTE regions which the operation is touching. These may
    // or may not be big PTEs: use the big_ptes bitmap to determine that. For
    // example, a bit set here but not in big_ptes means that the PTE size for
    // that region should be 4k, and that some of those 4k PTEs will be written
    // by the operation.
    DECLARE_BITMAP(big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
} uvm_va_block_new_pte_state_t;
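
// Summary of the per-region bitmap combinations described above:
//
//     big_ptes  big_ptes_covered  meaning
//     --------  ----------------  -----------------------------------------
//         1             1         big PTE written by this operation
//         1             0         pre-existing big PTE, left unchanged
//         0             1         region stays 4k; some 4k PTEs are written
//         0             0         region is not touched by this operation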

// Event that triggered the call to uvm_va_block_make_resident/
// uvm_va_block_make_resident_read_duplicate
typedef enum
{
    UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT,
    UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT,
    UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER,
    UVM_MAKE_RESIDENT_CAUSE_PREFETCH,
    UVM_MAKE_RESIDENT_CAUSE_EVICTION,
    UVM_MAKE_RESIDENT_CAUSE_API_TOOLS,
    UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
    UVM_MAKE_RESIDENT_CAUSE_API_SET_RANGE_GROUP,
    UVM_MAKE_RESIDENT_CAUSE_API_HINT,

    UVM_MAKE_RESIDENT_CAUSE_MAX
} uvm_make_resident_cause_t;

// Page masks are printed as hex digits, with the last word of the bitmap
// printed first (left to right). For readability, a colon separates each group
// of pages stored in the same word of the bitmap.
#define UVM_PAGE_MASK_WORDS                 (PAGES_PER_UVM_VA_BLOCK / BITS_PER_LONG)
#define UVM_PAGE_MASK_PRINT_NUM_COLONS      (UVM_PAGE_MASK_WORDS > 0 ? UVM_PAGE_MASK_WORDS - 1 : 0)
#define UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE (PAGES_PER_UVM_VA_BLOCK / 4 + UVM_PAGE_MASK_PRINT_NUM_COLONS + 1)
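
// Worked example (assuming a 4K PAGE_SIZE and 64-bit longs): 512 pages per
// block gives UVM_PAGE_MASK_WORDS == 8, so 7 colons are printed and the buffer
// needs 512/4 == 128 hex digits + 7 colons + 1 terminating NUL == 136 bytes.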

typedef struct
{
    // Pages that need to be mapped with the corresponding protection
    uvm_page_mask_t page_mask;

    // Number of pages that need to be mapped with the corresponding
    // protection. This is the same value as the result of
    // bitmap_weight(page_mask).
    unsigned count;
} uvm_prot_page_mask_array_t[UVM_PROT_MAX - 1];
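
// The array has one entry per real protection level; no mask is needed for
// UVM_PROT_NONE. Assuming UVM_PROT_NONE is the first enumerator, a lookup for
// a given protection would be (illustrative, variable names hypothetical):
//
//     uvm_page_mask_t *mask = &mask_by_prot[prot - 1].page_mask;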

typedef struct
{
    // A per-NUMA-node array of page masks (of size num_possible_nodes()); each
    // mask holds the set of CPU pages used by the migration operation on that
    // node.
    uvm_page_mask_t **node_masks;

    // Node mask used to iterate over the page masks above.
    // If a node's bit is set, the page mask at index node_to_index() in
    // node_masks has pages set.
    nodemask_t nodes;
} uvm_make_resident_page_tracking_t;
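
// Illustrative iteration over the tracked nodes (not driver code; the tracking
// variable is hypothetical). for_each_node_mask() is the standard
// <linux/nodemask.h> iterator:
//
//     int nid;
//
//     for_each_node_mask(nid, tracking->nodes) {
//         uvm_page_mask_t *mask = tracking->node_masks[node_to_index(nid)];
//         // ... bits set in *mask refer to CPU pages on NUMA node nid
//     }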

// In the worst case some VA block operations require more state than we should
// reasonably store on the stack. Instead, we dynamically allocate VA block
// contexts. These are used for almost all operations on VA blocks.
typedef struct
{
    // Available as scratch space for the caller. Not used by any of the VA
    // block APIs.
    uvm_page_mask_t caller_page_mask;

    // Available as scratch space for the internal APIs. This is like a caller-
    // save register: it shouldn't be used across function calls which also take
    // this block_context.
    uvm_page_mask_t scratch_page_mask;

    // Scratch node mask. This follows the same rules as scratch_page_mask.
    nodemask_t scratch_node_mask;

    // Available as scratch space for the internal APIs. This is like a caller-
    // save register: it shouldn't be used across function calls which also take
    // this va_block_context.
    uvm_processor_mask_t scratch_processor_mask;

    // Temporary mask in block_add_eviction_mappings().
    uvm_processor_mask_t map_processors_eviction;

    // State used by uvm_va_block_make_resident
    struct uvm_make_resident_context_struct
    {
        // Masks used internally
        uvm_page_mask_t page_mask;
        uvm_page_mask_t copy_resident_pages_mask;
        uvm_page_mask_t pages_staged;

        // Stores which pages were successfully copied to the destination
        // processor; used by uvm_va_block_make_resident_finish() to update the
        // va_block state.
        uvm_page_mask_t pages_migrated;

        // Out mask filled in by uvm_va_block_make_resident to indicate which
        // pages actually changed residency.
        uvm_page_mask_t pages_changed_residency;

        // Out mask of all processors involved in the migration, either as
        // source, destination, or the processor performing the copy.
        // Used to perform ECC checks after the migration is done.
        uvm_processor_mask_t all_involved_processors;

        // Page mask used to compute the set of CPU pages for each CPU node.
        uvm_page_mask_t node_pages_mask;

        // Final residency for the data. This is useful for callees to know if
        // a migration is part of a staging copy.
        uvm_processor_id_t dest_id;

        // Final residency NUMA node if the migration destination is the CPU.
        int dest_nid;

        // This structure is used to track CPU pages used for migrations on
        // a per-NUMA-node basis.
        //
        // The pages are used either for migrations to the CPU (tracking the
        // destination CPU pages) or for staging copies (tracking the CPU pages
        // used for the staging).
        uvm_make_resident_page_tracking_t cpu_pages_used;

        // Event that triggered the call
        uvm_make_resident_cause_t cause;
    } make_resident;

    // State used by the mapping APIs (unmap, map, revoke). This could be used
    // at the same time as the state in make_resident.
    struct
    {
        // Master mask shared by uvm_va_block_map and uvm_va_block_unmap;
        // sharing is safe because they are never called concurrently. Bits are
        // removed as the operation progresses.
        uvm_page_mask_t map_running_page_mask;

        // Master mask used by uvm_va_block_revoke. It can be used concurrently
        // with map_running_page_mask since revoke calls unmap and map. Bits
        // are removed as the operation progresses.
        uvm_page_mask_t revoke_running_page_mask;

        // Mask shared by block_gpu_split_2m and block_gpu_split_big to track
        // splitting of big PTEs; they are never called concurrently. This
        // mask can be used concurrently with other page masks.
        uvm_page_mask_t big_split_page_mask;

        // Mask used by block_unmap_gpu to track non_uvm_lite_gpus which have
        // this block mapped. This mask can be used concurrently with the other
        // masks.
        uvm_processor_mask_t non_uvm_lite_gpus;

        uvm_page_mask_t page_mask;
        uvm_page_mask_t filtered_page_mask;
        uvm_page_mask_t migratable_mask;

        uvm_va_block_new_pte_state_t new_pte_state;

        uvm_pte_batch_t pte_batch;
        uvm_tlb_batch_t tlb_batch;

        // Event that triggered the call to the mapping function
        UvmEventMapRemoteCause cause;
    } mapping;

    // Used when adding page mappings with different protections
    uvm_prot_page_mask_array_t mask_by_prot;

    struct
    {
        uvm_page_mask_t running_page_mask;
    } update_read_duplicated_pages;

    // mm to use for the operation. If this is non-NULL, the caller guarantees
    // that the mm will be valid (reference held) for the duration of the
    // block operation.
    //
    // If this is NULL, the block operation skips anything which would require
    // the mm, such as creating CPU mappings.
    struct mm_struct *mm;

    struct
    {
        // These are used for migrate_vma_*(), hmm_range_fault(), and
        // make_device_exclusive_range() handling.
        unsigned long src_pfns[PAGES_PER_UVM_VA_BLOCK];
        union {
            unsigned long dst_pfns[PAGES_PER_UVM_VA_BLOCK];
            struct page *pages[PAGES_PER_UVM_VA_BLOCK];
        };

        // Cached VMA pointer. This is only valid while holding the mmap_lock.
        struct vm_area_struct *vma;

#if UVM_IS_CONFIG_HMM()

        // Temporary mask used in uvm_hmm_block_add_eviction_mappings().
        uvm_processor_mask_t map_processors_eviction;

        // Used for migrate_vma_*() to migrate pages to/from GPU/CPU.
        struct migrate_vma migrate_vma_args;
#endif
    } hmm;

    // Convenience buffer for page mask prints
    char page_mask_string_buffer[UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE];
} uvm_va_block_context_t;

typedef enum
{
    UVM_VA_BLOCK_TRANSFER_MODE_MOVE = 1,
    UVM_VA_BLOCK_TRANSFER_MODE_COPY = 2
} uvm_va_block_transfer_mode_t;

struct uvm_reverse_map_struct
{
    // VA block to which the VA region of this Phys/DMA -> Virt translation
    // belongs
    uvm_va_block_t             *va_block;

    // VA block region covered by this translation
    uvm_va_block_region_t         region;

    // Processor the physical memory range belongs to
    uvm_processor_id_t             owner;
};

typedef enum
{
    UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
    UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
    UVM_SERVICE_OPERATION_ACCESS_COUNTERS,
} uvm_service_operation_t;

typedef enum
{
    UVM_MIGRATE_MODE_MAKE_RESIDENT,
    UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
} uvm_migrate_mode_t;

#endif // __UVM_VA_BLOCK_TYPES_H__