/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_VA_BLOCK_TYPES_H__
#define __UVM_VA_BLOCK_TYPES_H__

#include "uvm_common.h"
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "uvm_forward_decl.h"

#include <linux/migrate.h>
#include <linux/nodemask.h>

// UVM_VA_BLOCK_BITS is 21, meaning the maximum block size is 2MB. Rationale:
// - 2MB matches the largest Pascal GPU page size, so it's a natural fit
// - 2MB won't span more than one PDE on any chip, so the VA blocks never need
//   to track more than a single GPU PDE.
// - 2MB is a decent tradeoff between memory overhead and serialization
//   contention.
//
#define UVM_VA_BLOCK_BITS 21

// Max size of a block in bytes
#define UVM_VA_BLOCK_SIZE (1ULL << UVM_VA_BLOCK_BITS)

#define UVM_VA_BLOCK_ALIGN_DOWN(addr) UVM_ALIGN_DOWN(addr, UVM_VA_BLOCK_SIZE)
#define UVM_VA_BLOCK_ALIGN_UP(addr)   UVM_ALIGN_UP(addr, UVM_VA_BLOCK_SIZE)

#define PAGES_PER_UVM_VA_BLOCK (UVM_VA_BLOCK_SIZE / PAGE_SIZE)

#define UVM_MIN_BIG_PAGE_SIZE UVM_PAGE_SIZE_64K
#define MAX_BIG_PAGES_PER_UVM_VA_BLOCK (UVM_VA_BLOCK_SIZE / UVM_MIN_BIG_PAGE_SIZE)

// Prefetch heuristics shift the VA block page mask so that it is always
// aligned to big page granularity. The big page size is guaranteed not to
// exceed UVM_VA_BLOCK_SIZE, so the shifted mask uses at most
// 2 * PAGES_PER_UVM_VA_BLOCK pages. Note that uvm_page_index_t needs to be
// able to hold outer page indices (one beyond the last one), for example in
// uvm_va_block_region_t.
#if (2 * PAGES_PER_UVM_VA_BLOCK) <= NV_U8_MAX
typedef NvU8 uvm_page_index_t;
#elif (2 * PAGES_PER_UVM_VA_BLOCK) <= NV_U16_MAX
typedef NvU16 uvm_page_index_t;
#else
#warning "Suspicious value for PAGES_PER_UVM_VA_BLOCK"
typedef NvU32 uvm_page_index_t;
#endif

// Encapsulates a [first, outer) region of pages within a va block
typedef struct
{
    // Page indices within the va block
    uvm_page_index_t first;
    uvm_page_index_t outer;
} uvm_va_block_region_t;

typedef struct
{
    DECLARE_BITMAP(bitmap, PAGES_PER_UVM_VA_BLOCK);
} uvm_page_mask_t;
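
// Illustrative sketch, not part of this header's API: converting a
// page-aligned [start, end) byte range that lies within a single VA block
// into a block-relative region, then marking those pages in a page mask with
// the Linux bitmap API:
//
//     NvU64 block_start = UVM_VA_BLOCK_ALIGN_DOWN(start);
//     uvm_va_block_region_t region;
//     uvm_page_mask_t mask;
//
//     region.first = (uvm_page_index_t)((start - block_start) / PAGE_SIZE);
//     region.outer = (uvm_page_index_t)((end - block_start) / PAGE_SIZE);
//
//     bitmap_zero(mask.bitmap, PAGES_PER_UVM_VA_BLOCK);
//     bitmap_set(mask.bitmap, region.first, region.outer - region.first);
//
// Note that region.outer may equal PAGES_PER_UVM_VA_BLOCK (one past the last
// valid page index), which is why uvm_page_index_t above is sized to hold
// values beyond the last page.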

// When updating GPU PTEs, this struct describes the new arrangement of PTE
// sizes. It is calculated before the operation is applied so we know which
// PTE sizes to allocate.
//
// This only describes the new layout. The operation page mask describes the
// new permissions of each of these PTEs.
typedef struct
{
    // Whether the new PTE should remain 2m (if already 2m) or be merged to
    // 2m. The meaning is the same as uvm_va_block_gpu_state_t::pte_is_2m. If
    // this is set, the other fields can be ignored.
    bool pte_is_2m;

    // Whether the operation requires writing 4k PTEs and thus needs them
    // allocated. Mutually exclusive with pte_is_2m, but not with big_ptes.
    bool needs_4k;

    // These are the PTEs which will be big after the operation is done. This
    // field will become the new value of uvm_va_block_gpu_state_t::big_ptes,
    // so it contains both those big PTEs which are being modified by the
    // operation and any pre-existing big PTEs which remain unchanged. The
    // latter will not have the corresponding bit set in big_ptes_covered.
    DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);

    // These are the big PTE regions which the operation is touching. These
    // may or may not be big PTEs: use the big_ptes bitmap to determine that.
    // For example, a bit set here but not in big_ptes means that the PTE
    // size for that region should be 4k, and that some of those 4k PTEs will
    // be written by the operation.
    DECLARE_BITMAP(big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
} uvm_va_block_new_pte_state_t;
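
// Illustrative sketch, not part of this header: how the two bitmaps above
// combine for a given big page index i (test_bit() is the Linux bitmap
// accessor):
//
//     if (test_bit(i, new_pte_state->big_ptes)) {
//         // Region i will be mapped by a single big PTE after the operation
//     }
//     else if (test_bit(i, new_pte_state->big_ptes_covered)) {
//         // Region i is touched by the operation but will use 4k PTEs
//     }
//     else {
//         // Region i is untouched by the operation
//     }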

// Event that triggered the call to uvm_va_block_make_resident/
// uvm_va_block_make_resident_read_duplicate
typedef enum
{
    UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT,
    UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT,
    UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER,
    UVM_MAKE_RESIDENT_CAUSE_PREFETCH,
    UVM_MAKE_RESIDENT_CAUSE_EVICTION,
    UVM_MAKE_RESIDENT_CAUSE_API_TOOLS,
    UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
    UVM_MAKE_RESIDENT_CAUSE_API_SET_RANGE_GROUP,
    UVM_MAKE_RESIDENT_CAUSE_API_HINT,

    UVM_MAKE_RESIDENT_CAUSE_MAX
} uvm_make_resident_cause_t;

// Page masks are printed using hex digits, printing last to first from left
// to right. For readability, a colon is added to separate each group of
// pages stored in the same word of the bitmap.
#define UVM_PAGE_MASK_WORDS (PAGES_PER_UVM_VA_BLOCK / BITS_PER_LONG)
#define UVM_PAGE_MASK_PRINT_NUM_COLONS (UVM_PAGE_MASK_WORDS > 0 ? UVM_PAGE_MASK_WORDS - 1 : 0)
#define UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE (PAGES_PER_UVM_VA_BLOCK / 4 + UVM_PAGE_MASK_PRINT_NUM_COLONS + 1)

typedef struct
{
    // Pages that need to be mapped with the corresponding protection
    uvm_page_mask_t page_mask;

    // Number of pages that need to be mapped with the corresponding
    // protections. This is the same value as the result of
    // bitmap_weight(page_mask)
    unsigned count;
} uvm_prot_page_mask_array_t[UVM_PROT_MAX - 1];

typedef struct
{
    // A per-NUMA-node array of page masks (of size num_possible_nodes())
    // that hold the sets of CPU pages used by the migration operation.
    uvm_page_mask_t **node_masks;

    // Node mask used to iterate over the page masks above. If a node's bit
    // is set, it means that the page mask at index node_to_index() in
    // node_masks has set pages.
    nodemask_t nodes;
} uvm_make_resident_page_tracking_t;
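
// Illustrative sketch, not part of this header: walking the per-node masks
// recorded by a migration. for_each_node_mask() is the Linux nodemask
// iterator; node_to_index() is the node-to-slot mapping referenced in the
// comment above.
//
//     int nid;
//
//     for_each_node_mask(nid, tracking->nodes) {
//         uvm_page_mask_t *mask = tracking->node_masks[node_to_index(nid)];
//
//         // ... process the CPU pages tracked for NUMA node nid ...
//     }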

// In the worst case some VA block operations require more state than we
// should reasonably store on the stack. Instead, we dynamically allocate VA
// block contexts. These are used for almost all operations on VA blocks.
typedef struct
{
    // Available as scratch space for the caller. Not used by any of the VA
    // block APIs.
    uvm_page_mask_t caller_page_mask;

    // Available as scratch space for the caller. Not used by any of the VA
    // block APIs.
    uvm_processor_mask_t caller_processor_mask;

    // Available as scratch space for the internal APIs. This is like a
    // caller-save register: it shouldn't be used across function calls which
    // also take this block_context.
    uvm_page_mask_t scratch_page_mask;

    // Scratch node mask. This follows the same rules as scratch_page_mask.
    nodemask_t scratch_node_mask;

    // Available as scratch space for the internal APIs. This is like a
    // caller-save register: it shouldn't be used across function calls which
    // also take this va_block_context.
    uvm_processor_mask_t scratch_processor_mask;

    // Temporary mask used in block_add_eviction_mappings().
    uvm_processor_mask_t map_processors_eviction;

    // Temporary mask used in uvm_perf_thrashing_unmap_remote_pinned_pages_all.
    uvm_processor_mask_t unmap_processors_mask;

    // Temporary mask used in thrashing_processors_have_fast_access().
    uvm_processor_mask_t fast_access_mask;

    // State used by uvm_va_block_make_resident
    struct uvm_make_resident_context_struct
    {
        // Masks used internally
        uvm_page_mask_t page_mask;
        uvm_page_mask_t copy_resident_pages_mask;
        uvm_page_mask_t pages_staged;

        // This is used to store which pages were successfully copied to the
        // destination processor, and is used by
        // uvm_va_block_make_resident_finish() to update the va_block state.
        uvm_page_mask_t pages_migrated;

        // Out mask filled in by uvm_va_block_make_resident to indicate which
        // pages actually changed residency.
        uvm_page_mask_t pages_changed_residency;

        // Out mask of all processors involved in the migration, whether as
        // the source, the destination, or the processor performing the copy.
        // Used to perform ECC checks after the migration is done.
        uvm_processor_mask_t all_involved_processors;

        // Page mask used to compute the set of CPU pages for each CPU node.
        uvm_page_mask_t node_pages_mask;

        // Final residency for the data. This is useful for callees to know
        // whether a migration is part of a staging copy.
        uvm_processor_id_t dest_id;

        // Final residency NUMA node if the migration destination is the CPU.
        int dest_nid;

        // This structure is used to track CPU pages used for migrations on a
        // per-NUMA-node basis.
        //
        // The pages could be used for either migrations to the CPU (used to
        // track the destination CPU pages) or staging copies (used to track
        // the CPU pages used for the staging).
        uvm_make_resident_page_tracking_t cpu_pages_used;

        // Event that triggered the call
        uvm_make_resident_cause_t cause;
    } make_resident;

    // State used by the mapping APIs (unmap, map, revoke). This could be
    // used at the same time as the state in make_resident.
    struct
    {
        // Master mask used by uvm_va_block_map and uvm_va_block_unmap; the
        // two are never called concurrently. Bits are removed as the
        // operation progresses.
        uvm_page_mask_t map_running_page_mask;

        // Master mask used by uvm_va_block_revoke. It can be used
        // concurrently with map_running_page_mask since revoke calls unmap
        // and map. Bits are removed as the operation progresses.
        uvm_page_mask_t revoke_running_page_mask;

        // Mask used by block_gpu_split_2m and block_gpu_split_big to track
        // splitting of big PTEs; the two are never called concurrently. This
        // mask can be used concurrently with other page masks.
        uvm_page_mask_t big_split_page_mask;

        // Mask used by block_unmap_gpu to track non_uvm_lite_gpus which have
        // this block mapped. This mask can be used concurrently with other
        // page masks.
        uvm_processor_mask_t non_uvm_lite_gpus;

        uvm_page_mask_t page_mask;
        uvm_page_mask_t filtered_page_mask;
        uvm_page_mask_t migratable_mask;

        uvm_va_block_new_pte_state_t new_pte_state;

        uvm_pte_batch_t pte_batch;
        uvm_tlb_batch_t tlb_batch;

        // Event that triggered the call to the mapping function
        UvmEventMapRemoteCause cause;
    } mapping;

    // Used when adding page mappings with different protections
    uvm_prot_page_mask_array_t mask_by_prot;

    struct
    {
        uvm_page_mask_t running_page_mask;
    } update_read_duplicated_pages;

    // mm to use for the operation. If this is non-NULL, the caller
    // guarantees that the mm will be valid (reference held) for the duration
    // of the block operation.
    //
    // If this is NULL, the block operation skips anything which would
    // require the mm, such as creating CPU mappings.
    struct mm_struct *mm;

    struct
    {
        // These are used for migrate_vma_*(), hmm_range_fault(), and
        // make_device_exclusive_range() handling.
        unsigned long src_pfns[PAGES_PER_UVM_VA_BLOCK];
        union {
            unsigned long dst_pfns[PAGES_PER_UVM_VA_BLOCK];
            struct page *pages[PAGES_PER_UVM_VA_BLOCK];
        };

        // Cached VMA pointer. This is only valid while holding the
        // mmap_lock.
        struct vm_area_struct *vma;

#if UVM_IS_CONFIG_HMM()

        // Temporary mask used in uvm_hmm_block_add_eviction_mappings().
        uvm_processor_mask_t map_processors_eviction;

        // Used for migrate_vma_*() to migrate pages to/from GPU/CPU.
        struct migrate_vma migrate_vma_args;
#endif
    } hmm;

    // Convenience buffer for page mask prints
    char page_mask_string_buffer[UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE];
} uvm_va_block_context_t;

typedef enum
{
    UVM_VA_BLOCK_TRANSFER_MODE_MOVE = 1,
    UVM_VA_BLOCK_TRANSFER_MODE_COPY = 2
} uvm_va_block_transfer_mode_t;

struct uvm_reverse_map_struct
{
    // VA block to which the VA region of this Phys/DMA -> Virt translation
    // belongs
    uvm_va_block_t *va_block;

    // VA block region covered by this translation
    uvm_va_block_region_t region;

    // Processor the physical memory range belongs to
    uvm_processor_id_t owner;
};

typedef enum
{
    UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
    UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
    UVM_SERVICE_OPERATION_ACCESS_COUNTERS,
} uvm_service_operation_t;

typedef enum
{
    UVM_MIGRATE_MODE_MAKE_RESIDENT,
    UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
} uvm_migrate_mode_t;

#endif // __UVM_VA_BLOCK_TYPES_H__