/*******************************************************************************
    Copyright (c) 2017-2021 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_PMM_SYSMEM_H__
#define __UVM_PMM_SYSMEM_H__

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_lock.h"
#include "uvm_pmm_gpu.h"

// Module to handle per-GPU user mappings to sysmem physical memory. Notably,
// this implements a reverse map of the DMA address to {va_block, virt_addr}.
// This is required by the GPU access counters feature since they may provide a
// physical address in the notification packet (GPA notifications). We use the
// table to obtain the VAs of the memory regions being accessed remotely. The
// reverse map is implemented by a radix tree, which is indexed using the
// DMA address. For now, only PAGE_SIZE translations are supported (i.e. no
// big/huge pages).
//
// TODO: Bug 1995015: add support for physically-contiguous mappings.
struct uvm_pmm_sysmem_mappings_struct
{
    uvm_gpu_t *gpu;

    struct radix_tree_root reverse_map_tree;

    uvm_mutex_t reverse_map_lock;
};

// See comments in uvm_linux.h
#ifdef NV_RADIX_TREE_REPLACE_SLOT_PRESENT
#define uvm_pmm_sysmem_mappings_indirect_supported() true
#else
#define uvm_pmm_sysmem_mappings_indirect_supported() false
#endif

// Global initialization/exit functions that need to be called during driver
// initialization/tear-down. These are needed to allocate/free global internal
// data structures.
NV_STATUS uvm_pmm_sysmem_init(void);
void uvm_pmm_sysmem_exit(void);

// Initialize per-GPU sysmem mapping tracking
NV_STATUS uvm_pmm_sysmem_mappings_init(uvm_gpu_t *gpu, uvm_pmm_sysmem_mappings_t *sysmem_mappings);

// Destroy per-GPU sysmem mapping tracking. The caller must ensure that all the
// mappings have been removed before calling this function.
void uvm_pmm_sysmem_mappings_deinit(uvm_pmm_sysmem_mappings_t *sysmem_mappings);
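// Illustrative sketch (not taken from the driver sources): the expected
// per-GPU lifecycle of the reverse map. A local variable is used here only to
// keep the sketch self-contained; in practice the tracker lives in a per-GPU
// structure owned by the caller.
//
//     uvm_pmm_sysmem_mappings_t sysmem_mappings;
//
//     NV_STATUS status = uvm_pmm_sysmem_mappings_init(gpu, &sysmem_mappings);
//     if (status != NV_OK)
//         return status;
//
//     // ... add and remove reverse map entries while the GPU is in use ...
//
//     // All mappings must have been removed by this point.
//     uvm_pmm_sysmem_mappings_deinit(&sysmem_mappings);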
// If the GPU used to initialize sysmem_mappings supports access counters, the
// dma_addr -> {va_block, virt_addr} mapping is inserted in the reverse map.
NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                  NvU64 dma_addr,
                                                  NvU64 virt_addr,
                                                  NvU64 region_size,
                                                  uvm_va_block_t *va_block,
                                                  uvm_processor_id_t owner);

static NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                               NvU64 dma_addr,
                                                               NvU64 virt_addr,
                                                               NvU64 region_size,
                                                               uvm_va_block_t *va_block,
                                                               uvm_gpu_id_t owner)
{
    if (!uvm_pmm_sysmem_mappings_indirect_supported())
        return NV_OK;

    return uvm_pmm_sysmem_mappings_add_gpu_mapping(sysmem_mappings,
                                                   dma_addr,
                                                   virt_addr,
                                                   region_size,
                                                   va_block,
                                                   owner);
}

// If the GPU used to initialize sysmem_mappings supports access counters, the
// entries for the physical region starting at dma_addr are removed from the
// reverse map.
void uvm_pmm_sysmem_mappings_remove_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr);

static void uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr)
{
    if (uvm_pmm_sysmem_mappings_indirect_supported())
        uvm_pmm_sysmem_mappings_remove_gpu_mapping(sysmem_mappings, dma_addr);
}

// Like uvm_pmm_sysmem_mappings_remove_gpu_mapping but it doesn't assert if the
// mapping doesn't exist. See uvm_va_block_evict_chunks for more information.
void uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr);
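// Illustrative sketch (placeholder values, not from the driver sources):
// registering a PAGE_SIZE reverse map entry after the caller has created a DMA
// mapping, and removing it again on teardown. sysmem_mappings, dma_addr,
// virt_addr, va_block and owner_id are assumed to be provided by the
// surrounding code.
//
//     status = uvm_pmm_sysmem_mappings_add_gpu_mapping(sysmem_mappings,
//                                                      dma_addr,
//                                                      virt_addr,
//                                                      PAGE_SIZE,
//                                                      va_block,
//                                                      owner_id);
//     if (status != NV_OK)
//         return status;
//
//     // The region is now discoverable through the reverse map.
//
//     uvm_pmm_sysmem_mappings_remove_gpu_mapping(sysmem_mappings, dma_addr);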
// If the GPU used to initialize sysmem_mappings supports access counters, the
// mapping for the region starting at dma_addr is updated with va_block.
// This is required on VA block split.
void uvm_pmm_sysmem_mappings_reparent_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                  NvU64 dma_addr,
                                                  uvm_va_block_t *va_block);

static void uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                               NvU64 dma_addr,
                                                               uvm_va_block_t *va_block)
{
    if (uvm_pmm_sysmem_mappings_indirect_supported())
        uvm_pmm_sysmem_mappings_reparent_gpu_mapping(sysmem_mappings, dma_addr, va_block);
}

// If the GPU used to initialize sysmem_mappings supports access counters, the
// mapping for the region starting at dma_addr is split into regions of
// new_region_size. new_region_size must be a power of two and smaller than the
// previously-registered size.
NV_STATUS uvm_pmm_sysmem_mappings_split_gpu_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                     NvU64 dma_addr,
                                                     NvU64 new_region_size);

static NV_STATUS uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                                  NvU64 dma_addr,
                                                                  NvU64 new_region_size)
{
    if (!uvm_pmm_sysmem_mappings_indirect_supported())
        return NV_OK;

    return uvm_pmm_sysmem_mappings_split_gpu_mappings(sysmem_mappings, dma_addr, new_region_size);
}

// If the GPU used to initialize sysmem_mappings supports access counters, all
// the mappings within the region [dma_addr, dma_addr + new_region_size) are
// merged into a single mapping. new_region_size must be a power of two. The
// whole region must be previously populated with mappings and all of them must
// have the same VA block and processor owner.
void uvm_pmm_sysmem_mappings_merge_gpu_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                NvU64 dma_addr,
                                                NvU64 new_region_size);

static void uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                             NvU64 dma_addr,
                                                             NvU64 new_region_size)
{
    if (uvm_pmm_sysmem_mappings_indirect_supported())
        uvm_pmm_sysmem_mappings_merge_gpu_mappings(sysmem_mappings, dma_addr, new_region_size);
}

// Obtain the {va_block, virt_addr} information for the mappings in the given
// [dma_addr:dma_addr + region_size) range. dma_addr and region_size must be
// page-aligned.
//
// Valid translations are written to out_mappings sequentially (there are no
// gaps). At most max_out_mappings translations are written. The caller is
// required to provide enough entries in out_mappings.
//
// The VA block in each returned translation entry is retained, and it's up to
// the caller to release them.
size_t uvm_pmm_sysmem_mappings_dma_to_virt(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                           NvU64 dma_addr,
                                           NvU64 region_size,
                                           uvm_reverse_map_t *out_mappings,
                                           size_t max_out_mappings);
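// Illustrative sketch (placeholder names, not from the driver sources):
// translating a physical address reported by an access counter notification
// back to the VA blocks covering it. MAX_TRANSLATIONS is a caller-chosen
// bound; uvm_va_block_release() and the va_block field of uvm_reverse_map_t
// are assumed to be the matching release call and member name, respectively.
//
//     uvm_reverse_map_t translations[MAX_TRANSLATIONS];
//     size_t num_translations;
//     size_t i;
//
//     num_translations = uvm_pmm_sysmem_mappings_dma_to_virt(sysmem_mappings,
//                                                            notification_addr,
//                                                            PAGE_SIZE,
//                                                            translations,
//                                                            MAX_TRANSLATIONS);
//
//     for (i = 0; i < num_translations; i++) {
//         // ... process the virtual region described by translations[i] ...
//
//         // Each returned entry holds a retained VA block reference.
//         uvm_va_block_release(translations[i].va_block);
//     }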
#define UVM_CPU_CHUNK_SIZES (UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | PAGE_SIZE)

typedef enum
{
    UVM_CPU_CHUNK_ALLOC_FLAGS_NONE = 0,

    // Zero the chunk.
    UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO = (1 << 0),

    // Account for the chunk in the cgroup context.
    UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT = (1 << 1),
} uvm_cpu_chunk_alloc_flags_t;

typedef enum
{
    UVM_CPU_CHUNK_TYPE_PHYSICAL,
    UVM_CPU_CHUNK_TYPE_LOGICAL,
    UVM_CPU_CHUNK_TYPE_HMM
} uvm_cpu_chunk_type_t;

// CPU memory chunk descriptor.
// CPU memory chunks represent a physically contiguous CPU memory
// allocation.
// CPU memory chunks can be created due to CPU page allocation or
// CPU chunk splitting. Chunks created due to page allocations are
// referred to as "physical chunks", while chunks resulting from
// splitting are referred to as "logical chunks".
struct uvm_cpu_chunk_struct
{
    uvm_cpu_chunk_type_t type:2;

    // Size of the chunk.
    // For chunks resulting from page allocations (physical chunks),
    // this value is the size of the physical allocation.
    size_t log2_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);

    // Chunk reference count used when a CPU chunk is split. Each
    // child sub-chunk will increment the reference count of its
    // parent.
    // The reference count is set to 1 when the chunk is created.
    // This initial reference is dropped if the chunk is split in
    // order to automatically destroy the chunk when all logical
    // chunks resulting from the split are destroyed.
    nv_kref_t refcount;

    // Pointer to the CPU page backing this CPU chunk.
    // For physical chunks, this will point to the head page. Physical
    // chunk allocation will set the reference count for the struct
    // page (compound or not) to 1.
    //
    // For logical chunks, this will point to the struct page from
    // the compound page array corresponding to the correct page index.
    // Because freeing a logical chunk does not result in freeing of
    // any struct page(s) and both physical and logical chunks are
    // reference counted, there is no need to take separate references
    // to the struct page for logical chunks.
    struct page *page;
};

typedef struct
{
    NvU64 dma_addr;
    NvU32 map_count;
} uvm_cpu_phys_mapping_t;

typedef struct
{
    uvm_cpu_chunk_t common;

    // Lock protecting dirty_bitmap and gpu_mappings.
    uvm_mutex_t lock;

    struct
    {
        // Per-GPU array of DMA mapping addresses for the chunk.
        // The DMA mapping addresses for logical chunks are adjusted
        // to the correct offset within the parent chunk.
        union
        {
            uvm_cpu_phys_mapping_t static_entry;
            uvm_cpu_phys_mapping_t *dynamic_entries;
        };

        // Maximum number of physical mapping entries available.
        // The initial value is 1 since the static_entry is always
        // available.
        // When dynamic_entries is used, max_entries holds the size of
        // the dynamic_entries array. This may be more than the number
        // of GPUs with active mappings. The number of active entries
        // is the number of set bits in dma_addrs_mask.
        size_t max_entries;

        // The set of GPU IDs that have an active physical mapping.
        // Since physical mappings are shared by all GPUs under a
        // parent GPU, this mask only needs to track uvm_parent_gpu_t.
        uvm_processor_mask_t dma_addrs_mask;
    } gpu_mappings;

    // A dynamically allocated bitmap (one bit per PAGE_SIZE page) used
    // to track the dirty state of each PAGE_SIZE page.
    // Large CPU chunks are allocated as compound pages. For such
    // pages, the kernel keeps dirtiness state with a single bit
    // (in the compound page head) that covers the entire compound
    // page.
    //
    // In the case of UVM-Lite GPUs, using the dirty bit of the
    // compound page would cause a performance regression due to
    // the copying of extra data. We mitigate this by using this
    // bitmap to track which base pages are dirty.
    unsigned long *dirty_bitmap;

} uvm_cpu_physical_chunk_t;

typedef struct
{
    uvm_cpu_chunk_t common;

    // Pointer to the parent chunk (which could also be a logical chunk).
    uvm_cpu_chunk_t *parent;
    uvm_processor_mask_t mapped_gpus;
} uvm_cpu_logical_chunk_t;

// Return the set of allowed CPU chunk allocation sizes.
uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void);

// Allocate a physical CPU chunk of the specified size.
//
// If the CPU chunk allocation succeeds, NV_OK is returned and new_chunk is set
// to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
                              uvm_cpu_chunk_alloc_flags_t flags,
                              uvm_cpu_chunk_t **new_chunk);
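// Illustrative sketch (not from the driver sources): allocating a
// zero-initialized chunk and releasing it. PAGE_SIZE is always part of
// UVM_CPU_CHUNK_SIZES; larger sizes should be picked from
// uvm_cpu_chunk_get_allocation_sizes().
//
//     uvm_cpu_chunk_t *chunk;
//
//     status = uvm_cpu_chunk_alloc(PAGE_SIZE,
//                                  UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO,
//                                  &chunk);
//     if (status != NV_OK)
//         return status;
//
//     // ... map the chunk on one or more GPUs and use it ...
//
//     uvm_cpu_chunk_free(chunk);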
// Allocate an HMM CPU chunk.
//
// HMM chunks differ from normal CPU chunks in that the kernel has already
// allocated the page for them. This means we don't allocate any CPU memory
// here. It also means the kernel holds the reference to the page, so we
// shouldn't call put_page() when freeing the chunk.
//
// If the CPU chunk allocation succeeds, NV_OK is returned and new_chunk is set
// to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
//
// Note that the kernel retains logical ownership of the page. This means page
// properties should not be directly modified by UVM. In particular, page flags
// such as PageDirty should not be modified by UVM, nor can UVM directly free
// the page. The kernel is also responsible for mapping/unmapping the page on
// the CPU. We create a CPU chunk for the page primarily to allow GPU mappings
// for the page to be created.
NV_STATUS uvm_cpu_chunk_alloc_hmm(struct page *page,
                                  uvm_cpu_chunk_t **new_chunk);

// Convert a physical chunk to an HMM chunk.
static void uvm_cpu_chunk_make_hmm(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);

    chunk->type = UVM_CPU_CHUNK_TYPE_HMM;
}

uvm_chunk_size_t uvm_cpu_chunk_get_size(uvm_cpu_chunk_t *chunk);

// Return the number of base system pages covered by the CPU chunk.
static size_t uvm_cpu_chunk_num_pages(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(chunk);
    return uvm_cpu_chunk_get_size(chunk) / PAGE_SIZE;
}

static inline bool uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_t *chunk)
{
    return chunk->type == UVM_CPU_CHUNK_TYPE_HMM;
}

static bool uvm_cpu_chunk_is_physical(uvm_cpu_chunk_t *chunk)
{
    return (chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL || uvm_cpu_chunk_is_hmm(chunk));
}

static bool uvm_cpu_chunk_is_logical(uvm_cpu_chunk_t *chunk)
{
    return chunk->type == UVM_CPU_CHUNK_TYPE_LOGICAL;
}

static uvm_cpu_physical_chunk_t *uvm_cpu_chunk_to_physical(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(uvm_cpu_chunk_is_physical(chunk));
    return container_of((chunk), uvm_cpu_physical_chunk_t, common);
}

static uvm_cpu_logical_chunk_t *uvm_cpu_chunk_to_logical(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(uvm_cpu_chunk_is_logical(chunk));
    return container_of((chunk), uvm_cpu_logical_chunk_t, common);
}

// Free a CPU chunk.
// This may not result in the immediate freeing of the physical pages of the
// chunk if this is a logical chunk and there are other logical chunks holding
// references to the physical chunk.
// If any DMA mappings to this chunk are still active, they are implicitly
// destroyed.
void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk);

// In some configurations such as SR-IOV heavy, a CPU chunk cannot be
// referenced using its physical address; a kernel virtual mapping needs to be
// created.
//
// This helper function creates a DMA mapping on the GPU (see
// uvm_cpu_chunk_map_gpu()) and, if necessary, a kernel virtual mapping for the
// chunk. The virtual mapping persists until GPU deinitialization, such that no
// unmap functionality is exposed. For more details see uvm_mmu_sysmem_map().
//
// Note that unlike uvm_cpu_chunk_map_gpu(), this helper requires the GPU
// object instead of the parent GPU object.
NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);

// Destroy a CPU chunk's DMA mapping for the parent GPU.
// If the chunk is a logical chunk, this call may not necessarily destroy the
// DMA mapping of the parent physical chunk, since all logical chunks share the
// parent's DMA mapping.
void uvm_cpu_chunk_unmap_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);

// Get the CPU chunk's DMA mapping address for the specified GPU ID.
// If there is no mapping for the GPU, 0 is returned.
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
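// Illustrative sketch (not from the driver sources): creating the DMA mapping
// for a chunk on a GPU and looking up the resulting address. parent_gpu is
// assumed to be the uvm_parent_gpu_t associated with gpu, obtained by the
// caller.
//
//     status = uvm_cpu_chunk_map_gpu(chunk, gpu);
//     if (status != NV_OK)
//         return status;
//
//     dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, parent_gpu);
//     UVM_ASSERT(dma_addr != 0);
//
//     // ... issue copies or build GPU page tables using dma_addr ...
//
//     uvm_cpu_chunk_unmap_gpu_phys(chunk, parent_gpu);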
// Split a CPU chunk into a set of CPU chunks of the next size down from the
// set of enabled CPU chunk sizes.
//
// This function expects that the chunk to be split is larger than the minimum
// enabled chunk size and that new_chunks has enough space for all chunks
// resulting from the split.
//
// On success, NV_OK is returned and the caller-provided new_chunks array is
// filled out with the newly-created logical chunks.
//
// After a successful split, the input chunk can no longer be used.
//
// On failure, NV_ERR_NO_MEMORY is returned.
//
// This should never be called for HMM chunks: they don't need splitting (they
// can only be PAGE_SIZE), and even if larger chunks could exist, UVM could not
// split them without kernel interaction that currently isn't exported.
// NV_ERR_INVALID_ARGUMENT is returned for an HMM chunk.
// TODO: Bug 3368756: add support for transparent huge pages (THP)
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks);

// Merge an array of logical chunks into their parent chunk. All chunks must
// have the same size, parent, and set of mapped GPUs.
uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks);

// Mark the page_index sub-page of the chunk as dirty.
// page_index is an offset into the chunk.
//
// Note that the dirty status of HMM chunks should not be modified directly
// from UVM. Instead, the kernel will mark the backing struct pages dirty
// either on fault when written to from the CPU, or when the PTE is mirrored to
// the GPU using hmm_range_fault().
void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

// Mark the page_index sub-page of the chunk as clean.
// page_index is an offset into the chunk.
void uvm_cpu_chunk_mark_clean(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

// Return true if the page_index base page of the CPU chunk is dirty.
bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

static NV_STATUS uvm_test_get_cpu_chunk_allocation_sizes(UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES_PARAMS *params,
                                                         struct file *filp)
{
    params->alloc_size_mask = (NvU32)uvm_cpu_chunk_get_allocation_sizes();
    return NV_OK;
}
#endif // __UVM_PMM_SYSMEM_H__