/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_linux.h"
#include "uvm_common.h"
#include "uvm_api.h"
#include "uvm_gpu.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_va_block.h"
#include "uvm_hal_types.h"
#include "uvm_kvmalloc.h"
#include "uvm_tools.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_perf_thrashing.h"
#include "uvm_perf_prefetch.h"
#include "uvm_mem.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_va_space_mm.h"
#include "uvm_test_ioctl.h"
#include "uvm_conf_computing.h"

typedef enum
{
    BLOCK_PTE_OP_MAP,
    BLOCK_PTE_OP_REVOKE,
    BLOCK_PTE_OP_COUNT
} block_pte_op_t;

static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;

static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;

static int uvm_fault_force_sysmem __read_mostly = 0;
module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");

static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);

// Caching is always disabled for mappings to remote memory. The following two
// module parameters can be used to force caching for GPU peer/sysmem mappings.
//
// However, it may not be safe to enable caching in the general case, so these
// parameters should only be used for experiments.
static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
                 "Force caching for mappings to peer memory. "
                 "This is an experimental parameter that may cause correctness issues if used.");

static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
                 "Force caching for mappings to system memory. "
" 80 "This is an experimental parameter that may cause correctness issues if used."); 81 82 static void block_add_eviction_mappings_entry(void *args); 83 84 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block) 85 { 86 #if UVM_IS_CONFIG_HMM() 87 if (va_block->hmm.va_space) 88 return va_block->hmm.va_space; 89 #endif 90 91 if (va_block->va_range) 92 return va_block->va_range->va_space; 93 94 return NULL; 95 } 96 97 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block) 98 { 99 uvm_va_space_t *va_space; 100 101 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 102 103 va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 104 UVM_ASSERT(va_space); 105 106 return va_space; 107 } 108 109 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) 110 { 111 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 112 113 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 114 115 // Local vidmem is always cached 116 if (uvm_id_equal(resident_id, gpu->id)) 117 return UVM_MMU_PTE_FLAGS_CACHED; 118 119 if (UVM_ID_IS_CPU(resident_id)) 120 return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; 121 122 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id)); 123 124 return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; 125 } 126 127 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) 128 { 129 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 130 131 return uvm_va_space_get_gpu(va_space, gpu_id); 132 } 133 134 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id) 135 { 136 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 137 138 return uvm_va_space_processor_name(va_space, id); 139 } 140 141 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id) 142 { 143 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 144 145 return uvm_va_space_processor_has_memory(va_space, id); 146 } 147 148 static bool is_uvm_fault_force_sysmem_set(void) 149 { 150 // Only enforce this during testing 151 return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0; 152 } 153 154 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space) 155 { 156 return uvm_perf_map_remote_on_eviction && 157 uvm_va_space_has_access_counter_migrations(va_space); 158 } 159 160 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block) 161 { 162 // Note that for HMM we always return a pointer to a zero bitmap 163 // (not allocated on the stack) since uvm_lite GPUs are not supported. 164 static const uvm_processor_mask_t uvm_lite_gpus = {}; 165 166 if (uvm_va_block_is_hmm(va_block)) 167 return &uvm_lite_gpus; 168 else 169 return &va_block->va_range->uvm_lite_gpus; 170 } 171 172 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry) 173 { 174 if (!retry) 175 return; 176 177 uvm_tracker_init(&retry->tracker); 178 INIT_LIST_HEAD(&retry->used_chunks); 179 INIT_LIST_HEAD(&retry->free_chunks); 180 } 181 182 // The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks 183 // are stored. 184 // 185 // CPU chunk storage is handled in three different ways depending on the 186 // type of chunks the VA block owns. This is done to minimize the memory 187 // required to hold metadata. 188 typedef enum 189 { 190 // The uvm_va_block_t::chunk pointer points to a single 2MB 191 // CPU chunk. 
static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
                                                        uvm_cpu_chunk_t *chunk,
                                                        uvm_page_index_t page_index)
{
    UVM_ASSERT(chunk);
    return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
}

static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block)
{
    return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
}

static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block)
{
    return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
}

static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
{
    return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
}

static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
    uvm_page_index_t prefix;
    size_t slot_index;

    UVM_ASSERT(page_index < block_region.outer);
    prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);

    if (page_index < prefix)
        return 0;

    slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
    UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);

    return slot_index;
}

static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);

    if (page_index < prefix)
        return page_index;

    return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
}
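
// Worked example for the helpers above (illustrative, assuming PAGE_SIZE is
// 4K): for a block whose start is 4K-aligned but 0xF000 bytes short of a 64K
// boundary, compute_page_prefix() returns 15. Pages 0-14 land in slot 0,
// pages 15-30 land in slot 1, and so on. For page_index 20,
// compute_slot_index() returns (20 - 15) / 16 + 1 = 1 and
// compute_small_index() returns (20 - 15) % 16 = 5.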
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
                                        uvm_cpu_chunk_t *chunk,
                                        uvm_page_index_t page_index)
{
    uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
    uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
    size_t slot_index;
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t **chunks = NULL;

    // We only want to use the bottom bit of a pointer.
    BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);

    // We want to protect against two threads manipulating the VA block's CPU
    // chunks at the same time. However, when a block is split, the new block's
    // lock is locked without tracking. So, we can't use
    // uvm_assert_mutex_locked().
    UVM_ASSERT(mutex_is_locked(&va_block->lock.m));

    if (chunk_size == UVM_CHUNK_SIZE_2M) {
        UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
        UVM_ASSERT(!va_block->cpu.chunks);
        va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
    }
    else {
        if (!va_block->cpu.chunks) {
            mixed = uvm_kvmalloc_zero(sizeof(*mixed));
            if (!mixed)
                return NV_ERR_NO_MEMORY;

            va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
        }

        UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED);
        mixed = uvm_cpu_storage_get_ptr(va_block);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
        UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));

        if (chunk_size == UVM_CHUNK_SIZE_64K) {
            mixed->slots[slot_index] = chunk;
            set_bit(slot_index, mixed->big_chunks);
        }
        else {
            size_t small_index;

            UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
            chunks = mixed->slots[slot_index];

            if (!chunks) {
                chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
                if (!chunks)
                    return NV_ERR_NO_MEMORY;
                mixed->slots[slot_index] = chunks;
            }

            small_index = compute_small_index(va_block, page_index);
            chunks[small_index] = chunk;
        }
    }

    uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
    return NV_OK;
}

uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t *chunk;
    uvm_cpu_chunk_t **chunks;
    size_t slot_index;

    UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
    if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index))
        return NULL;

    UVM_ASSERT(va_block->cpu.chunks);

    if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
        return uvm_cpu_storage_get_ptr(va_block);
    }
    else {
        mixed = uvm_cpu_storage_get_ptr(va_block);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(mixed->slots[slot_index] != NULL);
        if (test_bit(slot_index, mixed->big_chunks))
            return mixed->slots[slot_index];

        chunks = mixed->slots[slot_index];
        chunk = chunks[compute_small_index(va_block, page_index)];
    }

    UVM_ASSERT(chunk);
    return chunk;
}

void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
                                     uvm_page_index_t page_index)
{
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
    uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
    size_t slot_index;
    uvm_cpu_chunk_t **chunks;

    // We want to protect against two threads manipulating the VA block's CPU
    // chunks at the same time. However, when a block is split, the new block's
    // lock is locked without tracking. So, we can't use
    // uvm_assert_mutex_locked().
    UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
    UVM_ASSERT(va_block->cpu.chunks);
    UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));

    if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
        UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
        UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk);
        va_block->cpu.chunks = 0;
    }
    else {
        UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
        mixed = uvm_cpu_storage_get_ptr(va_block);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(mixed->slots[slot_index] != NULL);

        if (test_bit(slot_index, mixed->big_chunks)) {
            UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
            UVM_ASSERT(mixed->slots[slot_index] == chunk);
            mixed->slots[slot_index] = NULL;
            clear_bit(slot_index, mixed->big_chunks);
        }
        else {
            size_t small_index;

            UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
            chunks = mixed->slots[slot_index];
            small_index = compute_small_index(va_block, page_index);
            UVM_ASSERT(chunks[small_index] == chunk);
            chunks[small_index] = NULL;

            for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
                if (chunks[small_index])
                    break;
            }

            if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
                uvm_kvfree(chunks);
                mixed->slots[slot_index] = NULL;
            }
        }
    }

    uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region);

    if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) {
        uvm_kvfree(uvm_cpu_storage_get_ptr(va_block));
        va_block->cpu.chunks = 0;
    }
}

struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_va_block_region_t chunk_region;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);

    UVM_ASSERT(chunk);
    UVM_ASSERT(chunk->page);
    chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
    return chunk->page + (page_index - chunk_region.first);
}

static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
                                                      uvm_va_block_region_t region,
                                                      uvm_page_index_t *first_chunk_page)
{
    uvm_cpu_chunk_t *chunk = NULL;
    uvm_page_index_t page_index;

    page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated);
    if (page_index < region.outer)
        chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);

    if (first_chunk_page && chunk) {
        uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
        *first_chunk_page = chunk_region.first;
    }

    return chunk;
}

#define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region)                                       \
    for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index));                                \
         (chunk) != NULL;                                                                                             \
         (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
                                                 uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \
                                                                     (region).outer),                                 \
                                                 &(page_index)))

#define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region)                 \
    for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)),                                \
         (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);                             \
         (chunk) != NULL;                                                                                             \
         (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
                                                 uvm_va_block_region((next_page_index), (region).outer),              \
                                                 &(page_index)),                                                      \
         (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))

#define for_each_cpu_chunk_in_block(chunk, page_index, va_block)                                                      \
    for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block)))

#define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block)                                \
    for_each_cpu_chunk_in_block_region_safe((chunk),                                                                  \
                                            (page_index),                                                             \
                                            (next_page_index),                                                        \
                                            (va_block),                                                               \
                                            uvm_va_block_region_from_block((va_block)))
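
// Illustrative usage of the iterators above (the same pattern used by
// block_check_cpu_chunks() below); the block lock must be held:
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index;
//
//     for_each_cpu_chunk_in_block(chunk, page_index, va_block) {
//         // page_index is the first page covered by chunk
//     }
//
// The _safe variant additionally tracks the next page index so the current
// chunk may be removed from the block inside the loop body.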
struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
                                                    struct mm_struct *mm,
                                                    NvU64 start,
                                                    uvm_va_block_region_t *region)
{
    struct vm_area_struct *vma;
    NvU64 end;

    if (start > va_block->end)
        return NULL;

    vma = find_vma_intersection(mm, start, va_block->end + 1);
    if (!vma)
        return NULL;

    if (start < vma->vm_start)
        start = vma->vm_start;

    end = vma->vm_end - 1;
    if (end > va_block->end)
        end = va_block->end;

    *region = uvm_va_block_region_from_start_end(va_block, start, end);

    return vma;
}

static bool block_check_cpu_chunks(uvm_va_block_t *block)
{
    uvm_cpu_chunk_t *chunk;
    size_t alloced_pages = 0;
    uvm_va_block_region_t prev_region = { 0 };
    uvm_page_index_t page_index;

    for_each_cpu_chunk_in_block(chunk, page_index, block) {
        uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
        size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
        uvm_page_index_t chunk_page;

        UVM_ASSERT(prev_region.outer <= chunk_region.first);
        UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
        UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));

        alloced_pages += uvm_cpu_chunk_num_pages(chunk);
        UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region));
        prev_region = chunk_region;

        for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
            UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk);
    }

    UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated));

    return true;
}

// Frees any left-over free chunks and unpins all the used chunks
void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
{
    uvm_gpu_t *gpu;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_chunk_t *next_chunk;

    if (!retry)
        return;

    uvm_tracker_deinit(&retry->tracker);

    // Free any unused chunks
    list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
        list_del_init(&gpu_chunk->list);
        gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
    }

    // Unpin all the used chunks now that we are done
    list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
        list_del_init(&gpu_chunk->list);
        gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        // HMM should have already moved allocated blocks to the referenced
        // state so any left over were not migrated and should be freed.
        if (uvm_va_block_is_hmm(va_block))
            uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
        else
            uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
    }
}

static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
{
    list_add_tail(&gpu_chunk->list, &retry->free_chunks);
}

static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
{
    list_add_tail(&gpu_chunk->list, &retry->used_chunks);
}

static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
{
    uvm_gpu_chunk_t *gpu_chunk;

    list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
        if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
            list_del_init(&gpu_chunk->list);
            return gpu_chunk;
        }
    }

    return NULL;
}

// Encapsulates a reference to a physical page belonging to a specific processor
// within a VA block.
typedef struct
{
    // Processor the page is on
    uvm_processor_id_t processor;

    // The page index
    uvm_page_index_t page_index;
} block_phys_page_t;

static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index)
{
    return (block_phys_page_t){ processor, page_index };
}

NV_STATUS uvm_va_block_init(void)
{
    if (uvm_enable_builtin_tests)
        g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
    else
        g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);

    if (!g_uvm_va_block_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
    if (!g_uvm_va_block_gpu_state_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
    if (!g_uvm_page_mask_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
    if (!g_uvm_va_block_context_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_va_block_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
    kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_cache);
}

uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
{
    uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
    if (block_context)
        uvm_va_block_context_init(block_context, mm);

    return block_context;
}

void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
{
    if (va_block_context)
        kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
}
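
// Illustrative usage of the context helpers above (a sketch; error handling
// and the surrounding locking are the caller's responsibility):
//
//     uvm_va_block_context_t *ctx = uvm_va_block_context_alloc(mm);
//     if (!ctx)
//         return NV_ERR_NO_MEMORY;
//     ... pass ctx to uvm_va_block_* operations ...
//     uvm_va_block_context_free(ctx);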
// Convert from page_index to chunk_index. The goal is for each system page in
// the region [start, start + size) to be covered by the largest naturally-
// aligned user chunk size.
size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
                                          NvU64 size,
                                          uvm_gpu_t *gpu,
                                          uvm_page_index_t page_index,
                                          uvm_chunk_size_t *out_chunk_size)
{
    uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
    uvm_chunk_size_t chunk_size, final_chunk_size;
    size_t num_chunks, num_chunks_total;
    NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(size));
    UVM_ASSERT(size > 0);
    UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
    UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
    BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);

    // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
    // mask, since we never deal with chunk sizes smaller than that (although we
    // may have PTEs mapping pages smaller than that).
    UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);

    // Optimize the ideal Pascal+ case: the whole block is covered by a single
    // 2M page.
    if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
        UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
        final_chunk_size = UVM_CHUNK_SIZE_2M;
        num_chunks_total = 0;
        goto out;
    }

    // Only one 2M chunk can fit within a VA block on any GPU architecture, so
    // remove that size from consideration.
    chunk_sizes &= ~UVM_CHUNK_SIZE_2M;

    // Next common case: the whole block is aligned and sized to perfectly fit
    // the largest page size.
    final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
    if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
        num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
        goto out;
    }

    // We didn't hit our special paths. Do it the hard way.

    num_chunks_total = 0;
    addr = start + page_index * PAGE_SIZE;
    end = start + size;
    final_chunk_size = 0;
    UVM_ASSERT(addr < end);

    // The below loop collapses almost completely when chunk_size == PAGE_SIZE
    // since in that lowest-common-denominator case everything is already
    // aligned. Skip it and handle that specially after the loop.
    //
    // Note that since we removed 2M already above, this loop will only iterate
    // once on x86 Pascal+ since only 64K is left.
    chunk_sizes &= ~PAGE_SIZE;

    // This loop calculates the number of chunks between start and addr by
    // calculating the number of whole chunks of each size between them,
    // starting with the largest allowed chunk size. This requires fewer
    // iterations than if we began from start and kept calculating the next
    // larger chunk size boundary.
    for_each_chunk_size_rev(chunk_size, chunk_sizes) {
        aligned_start = UVM_ALIGN_UP(start, chunk_size);
        aligned_addr  = UVM_ALIGN_DOWN(addr, chunk_size);
        aligned_end   = UVM_ALIGN_DOWN(end, chunk_size);

        // If addr and start are within the same chunk, try smaller
        if (aligned_start > aligned_addr)
            continue;

        // If addr and end are not in the same chunk, then addr is covered by a
        // single chunk of the current size. Ignore smaller boundaries between
        // addr and aligned_addr.
        if (aligned_addr < aligned_end && final_chunk_size == 0) {
            addr = aligned_addr;
            final_chunk_size = chunk_size;
        }

        // How many chunks of this size are between start and addr? Note that
        // this might be 0 since aligned_addr and aligned_start could be in the
        // same chunk.
        num_chunks = uvm_div_pow2_32((NvU32)(aligned_addr - aligned_start), chunk_size);
        num_chunks_total += num_chunks;

        // We've already accounted for these chunks, so "remove" them by
        // bringing start, addr, and end closer together to calculate the
        // remaining chunk sizes.
        temp_size = num_chunks * chunk_size;
        addr -= temp_size;
        end -= temp_size;

        // Once there's no separation between addr and start, and we've
        // successfully found the right chunk size when taking end into account,
        // we're done.
        if (addr == start && final_chunk_size)
            break;
    }

    // Handle PAGE_SIZE cleanup since we skipped it in the loop
    num_chunks_total += (addr - start) / PAGE_SIZE;
    if (final_chunk_size == 0)
        final_chunk_size = PAGE_SIZE;

out:
    if (out_chunk_size)
        *out_chunk_size = final_chunk_size;

    return num_chunks_total;
}
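
// Worked example for uvm_va_block_gpu_chunk_index_range() (illustrative,
// assuming the GPU supports 4K, 64K and 2M user chunks and PAGE_SIZE is 4K):
// for start = base + 0x1000 and size = 0x20000 (base 2M-aligned), the first 15
// pages can only be covered by 4K chunks (chunk indices 0-14) and the next
// 64K-aligned span is covered by a single 64K chunk (index 15). For
// page_index 25 the function therefore returns 15 with
// *out_chunk_size = UVM_CHUNK_SIZE_64K.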
static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
                                          NvU64 start,
                                          NvU64 size,
                                          uvm_gpu_t *gpu,
                                          uvm_page_index_t page_index,
                                          uvm_chunk_size_t *out_chunk_size)
{
    if (uvm_va_block_is_hmm(va_block)) {
        if (out_chunk_size)
            *out_chunk_size = PAGE_SIZE;
        return page_index;
    }

    return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
}

static size_t block_gpu_chunk_index(uvm_va_block_t *block,
                                    uvm_gpu_t *gpu,
                                    uvm_page_index_t page_index,
                                    uvm_chunk_size_t *out_chunk_size)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
    uvm_chunk_size_t size;
    uvm_gpu_chunk_t *chunk;
    size_t index;

    index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);

    UVM_ASSERT(size >= PAGE_SIZE);

    if (gpu_state) {
        UVM_ASSERT(gpu_state->chunks);
        chunk = gpu_state->chunks[index];
        if (chunk) {
            UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
            UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
            UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
        }
    }

    if (out_chunk_size)
        *out_chunk_size = size;

    return index;
}

// Compute the size of the chunk known to start at start_page_index
static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
{
    uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
    uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
    NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
    NvU64 size = block->end - start + 1;

    if (uvm_va_block_is_hmm(block))
        return PAGE_SIZE;

    // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a
    // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
    // Example: 1011000 -> 0001111
    start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));

    // Next, compute all sizes (powers of two) which are <= size.
    pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
    pow2_leq_size |= pow2_leq_size - 1;

    // Now AND them all together to get our list of GPU-supported chunk sizes
    // which are aligned to start and will fit within size.
    allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;

    // start and size must always be aligned to at least the smallest supported
    // chunk size (PAGE_SIZE).
    UVM_ASSERT(allowed_sizes >= PAGE_SIZE);

    // Take the largest allowed size
    return uvm_chunk_find_last_size(allowed_sizes);
}
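
// Worked example for block_gpu_chunk_size() (illustrative, assuming supported
// chunk sizes of 4K, 64K and 2M): with start aligned to 64K but not to 128K
// and size = 0x50000, start_alignments allows sizes up to 64K, pow2_leq_size
// allows sizes up to 256K, and the intersection with chunk_sizes is {4K, 64K},
// so the function returns 64K.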
static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
}

static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
{
    uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
    return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
}

uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
{
    size_t chunk_index;
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
    uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);

    uvm_assert_mutex_locked(&va_block->lock);

    if (!gpu_state)
        return NULL;

    chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);

    return gpu_state->chunks[chunk_index];
}

NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
                              NvU64 start,
                              NvU64 end,
                              uvm_va_block_t **out_block)
{
    uvm_va_block_t *block = NULL;
    NvU64 size = end - start + 1;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(end + 1));
    UVM_ASSERT(PAGE_ALIGNED(size));
    UVM_ASSERT(size > 0);
    UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);

    if (va_range) {
        // Create a managed va_block.
        UVM_ASSERT(start >= va_range->node.start);
        UVM_ASSERT(end <= va_range->node.end);
        UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    }

    // Blocks can't span a block alignment boundary
    UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));

    if (uvm_enable_builtin_tests) {
        uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);

        if (block_wrapper)
            block = &block_wrapper->block;
    }
    else {
        block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
    }

    if (!block)
        return NV_ERR_NO_MEMORY;

    nv_kref_init(&block->kref);
    uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
    block->start = start;
    block->end = end;
    block->va_range = va_range;
    uvm_tracker_init(&block->tracker);
    block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;

    nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);

    *out_block = block;
    return NV_OK;
}

static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
    NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
    if (gpu_mapping_addr == 0)
        return;

    uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
    uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
}

static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
                                                  uvm_va_block_t *block,
                                                  uvm_page_index_t page_index,
                                                  uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_chunk_size_t chunk_size;

    // When the Confidential Computing feature is enabled, the transfers don't
    // use the DMA mapping of CPU chunks (since it's protected memory), but
    // rather the DMA address of the unprotected DMA buffer.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = uvm_cpu_chunk_map_gpu(chunk, gpu);
    if (status != NV_OK)
        return status;

    chunk_size = uvm_cpu_chunk_get_size(chunk);

    // TODO: Bug 3744779: Handle benign assertion in
    //       pmm_sysmem_mappings_remove_gpu_mapping() in case of a
    //       failure.
    status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
                                                     uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
                                                     uvm_va_block_cpu_page_address(block, page_index),
                                                     chunk_size,
                                                     block,
                                                     UVM_ID_CPU);
    if (status != NV_OK)
        cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);

    return status;
}

static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    uvm_cpu_chunk_t *chunk;
    uvm_page_index_t page_index;

    for_each_cpu_chunk_in_block(chunk, page_index, block)
        cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
}

static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_cpu_chunk_t *chunk;
    NvU64 block_mapping_size = uvm_va_block_size(block);
    uvm_page_index_t page_index;

    UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));

    for_each_cpu_chunk_in_block(chunk, page_index, block) {
        UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
                       "GPU%u DMA address 0x%llx\n",
                       uvm_id_value(gpu->id),
                       uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));

        status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    block_gpu_unmap_phys_all_cpu_pages(block, gpu);
    return status;
}

static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
                                                     uvm_gpu_t *local_gpu,
                                                     uvm_gpu_chunk_t *chunk,
                                                     uvm_gpu_t *accessing_gpu)
{
    NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
    return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
                                                         peer_addr,
                                                         block->start + chunk->va_block_page_index * PAGE_SIZE,
                                                         uvm_gpu_chunk_get_size(chunk),
                                                         block,
                                                         local_gpu->id);
}

static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
                                                   uvm_gpu_chunk_t *chunk,
                                                   uvm_gpu_t *accessing_gpu)
{
    NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
    uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
}

static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
                                                        uvm_gpu_t *local_gpu,
                                                        uvm_gpu_t *accessing_gpu)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    size_t num_chunks, i;
    NV_STATUS status;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
                                       accessing_gpu->id));

    // If no chunks are allocated currently, the mappings will be created later
    // at chunk allocation.
    if (!gpu_state || !gpu_state->chunks)
        return NV_OK;

    num_chunks = block_num_gpu_chunks(block, local_gpu);
    for (i = 0; i < num_chunks; i++) {
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
        if (!chunk)
            continue;

        status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;

        status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    while (i-- > 0) {
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
        if (chunk) {
            // Indirect peer mappings are removed lazily by PMM, so if an error
            // occurs the mappings established above will be removed when the
            // chunk is freed later on. We only need to remove the sysmem
            // reverse mappings.
            block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
        }
    }

    return status;
}

// Mappings for indirect peers are removed lazily by PMM, but we need to remove
// the entries from the reverse map.
static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
                                                     uvm_gpu_t *local_gpu,
                                                     uvm_gpu_t *accessing_gpu)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    size_t num_chunks, i;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
                                       accessing_gpu->id));

    // Exit if no chunks are allocated currently.
    if (!gpu_state || !gpu_state->chunks)
        return;

    num_chunks = block_num_gpu_chunks(block, local_gpu);
    for (i = 0; i < num_chunks; i++) {
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
        if (chunk)
            block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
    }
}

// Retrieves the gpu_state for the given GPU. The returned pointer is
// internally managed and will be allocated (and freed) automatically,
// rather than by the caller.
static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);

    if (gpu_state)
        return gpu_state;

    gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
    if (!gpu_state)
        return NULL;

    gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
    if (!gpu_state->chunks)
        goto error;

    block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;

    status = block_gpu_map_phys_all_cpu_pages(block, gpu);
    if (status != NV_OK)
        goto error;

    return gpu_state;

error:
    uvm_kvfree(gpu_state->chunks);
    kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
    block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;

    return NULL;
}

NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));
    uvm_assert_mutex_locked(&va_block->lock);

    for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
        if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
            return NV_ERR_NO_MEMORY;
    }

    return NV_OK;
}

void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
                                          uvm_cpu_chunk_t *chunk,
                                          uvm_page_index_t page_index)
{
    uvm_gpu_id_t id;

    for_each_gpu_id(id) {
        if (uvm_va_block_gpu_state_get(block, id))
            cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
    }
}

NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
                                             uvm_page_index_t page_index)
{
    NV_STATUS status;
    uvm_gpu_id_t id;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);

    // We can't iterate over va_space->registered_gpus because we might be
    // on the eviction path, which does not have the VA space lock held. We have
    // the VA block lock held however, so the gpu_states can't change.
    uvm_assert_mutex_locked(&block->lock);

    for_each_gpu_id(id) {
        uvm_gpu_t *gpu;

        if (!uvm_va_block_gpu_state_get(block, id))
            continue;

        gpu = block_get_gpu(block, id);
        status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
    return status;
}

void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
{
    uvm_cpu_chunk_t *chunk;
    uvm_page_index_t page_index, next_page_index;
    uvm_va_block_region_t chunk_region;

    for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) {
        chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));

        uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
        uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
        uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region);
        uvm_cpu_chunk_remove_from_block(va_block, page_index);
        uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
        uvm_cpu_chunk_free(chunk);
    }

    if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
        uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
    if (uvm_page_mask_empty(&va_block->cpu.resident))
        uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
}

// Create physical mappings to allow other GPUs to access this chunk.
static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    uvm_gpu_t *accessing_gpu, *remove_gpu;
    NV_STATUS status;

    // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
    // the eviction path, so we can assume that the VA space is locked.
    //
    // TODO: Bug 2007346: In the future we may want to enable eviction to peers,
    //       meaning we may need to allocate peer memory and map it on the
    //       eviction path. That will require making sure that peers can't be
    //       enabled or disabled either in the VA space or globally within this
    //       function.
    uvm_assert_rwsem_locked(&va_space->lock);
    uvm_assert_mutex_locked(&block->lock);

    for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
        status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;

        status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
        if (remove_gpu == accessing_gpu)
            break;

        // Indirect peer mappings are removed lazily by PMM, so if an error
        // occurs the mappings established above will be removed when the
        // chunk is freed later on. We only need to remove the sysmem
        // reverse mappings.
        block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
    }

    return status;
}

static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    uvm_gpu_t *peer_gpu;

    uvm_assert_rwsem_locked(&va_space->lock);
    uvm_assert_mutex_locked(&block->lock);

    // Indirect peer mappings are removed lazily by PMM, so we only need to
    // remove the sysmem reverse mappings.
    for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
        block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
}

// Mark a CPU page as dirty.
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
    uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
}

// Mark a CPU page as clean.
static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
    uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
}

// Check if a CPU page is dirty.
static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
    return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
}

static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
                                       uvm_chunk_size_t alloc_size,
                                       uvm_cpu_chunk_alloc_flags_t flags,
                                       uvm_cpu_chunk_t **chunk)
{
    uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);

    // Return out of memory error if the tests have requested it. As opposed to
    // other error injection settings, this one fails N times and then succeeds.
    // TODO: Bug 3701182: This will print a warning in Linux kernels newer than
    //       5.16.0-rc1+.
    if (block_test && block_test->inject_cpu_pages_allocation_error_count) {
        if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
            block_test->inject_cpu_pages_allocation_error_count--;
        return NV_ERR_NO_MEMORY;
    }

    return uvm_cpu_chunk_alloc(alloc_size, flags, chunk);
}

// Allocates the input page in the block, if it doesn't already exist
//
// Also maps the page for physical access by all GPUs used by the block, which
// is required for IOMMU support. Skipped on GPUs that don't have access to CPU
// memory, e.g., when the Confidential Computing feature is enabled.
static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
                                          uvm_page_mask_t *populate_page_mask,
                                          uvm_va_block_region_t populate_region,
                                          uvm_va_block_context_t *block_context)
{
    NV_STATUS status = NV_OK;
    uvm_cpu_chunk_t *chunk;
    uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
    uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
    uvm_chunk_size_t alloc_size;
    uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
    uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    uvm_processor_mask_t uvm_lite_gpus;
    uvm_page_index_t page_index;
    uvm_gpu_id_t id;

    // Check whether all requested pages have already been allocated.
    uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
    if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
                              &block_context->scratch_page_mask,
                              &block->cpu.allocated))
        return NV_OK;

    if (block_test) {
        if (block_test->cpu_chunk_allocation_size_mask)
            cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
    }

    uvm_page_mask_zero(resident_mask);
    for_each_id_in_mask (id, &block->resident)
        uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id));

    // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
    // should be used in order to avoid extra copies due to dirty compound
    // pages. HMM va_blocks also require PAGE_SIZE allocations.
    // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
    //       migrations.
    uvm_processor_mask_andnot(&uvm_lite_gpus, &va_space->registered_gpus, &va_space->faultable_processors);
    if (!uvm_processor_mask_empty(&uvm_lite_gpus) || uvm_va_block_is_hmm(block))
        cpu_allocation_sizes = PAGE_SIZE;

    if (block_context->mm)
        alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;

    UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
    UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);

    for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
        uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags;
        uvm_va_block_region_t region = populate_region;

        if (uvm_page_mask_test(&block->cpu.allocated, page_index)) {
            page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1;
            continue;
        }

        UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index));

        chunk_alloc_flags = alloc_flags;

        // Attempt to allocate CPU pages with the largest physically contiguous
        // size from the set of CPU chunk sizes that we can.
        // This is accomplished by:
        // 1. Aligning the CPU page address down to the allocation size.
        // 2. Ensuring that the entire allocation region fits within the VA
        //    block.
        // 3. Ensuring that the region covered by the allocation is empty.
        for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
            NvU64 alloc_virt_addr;

            chunk = NULL;
            alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(block, page_index), alloc_size);

            if (!uvm_va_block_contains_address(block, alloc_virt_addr) ||
                !uvm_va_block_contains_address(block, alloc_virt_addr + alloc_size - 1))
                continue;

            region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1);

            if (!uvm_page_mask_region_empty(&block->cpu.allocated, region))
                continue;

            // If not all pages in the allocation region are resident somewhere,
            // zero out the allocated page.
            // This could be wasteful if only a few pages in a high-order
            // allocation need to be zeroed out, but the alternative is to map
            // single sub-pages one-by-one.
            if (!uvm_page_mask_region_full(resident_mask, region))
                chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;

            status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk);
            if (status == NV_OK) {
                page_index = region.first;
                break;
            }

            UVM_ASSERT(status == NV_ERR_NO_MEMORY);
        }

        if (status != NV_OK)
            break;

        status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
        if (status != NV_OK) {
            uvm_cpu_chunk_free(chunk);
            return status;
        }

        status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index);
        if (status != NV_OK)
            break;

        // Skip iterating over all pages covered by the allocated chunk.
        page_index = region.outer - 1;
    }

    if (status != NV_OK && chunk) {
        uvm_cpu_chunk_remove_from_block(block, page_index);
        uvm_cpu_chunk_free(chunk);
    }

    return status;
}

// Try allocating a chunk. If eviction was required,
// NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
// unlocked and relocked. The caller is responsible for adding the chunk to the
// retry used_chunks list.
static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
                                       uvm_va_block_retry_t *retry,
                                       uvm_gpu_t *gpu,
                                       uvm_chunk_size_t size,
                                       uvm_gpu_chunk_t **out_gpu_chunk)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_chunk_t *gpu_chunk;

    // First try getting a free chunk from previously-made allocations.
    gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
    if (!gpu_chunk) {
        uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
        if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
            // Force eviction by pretending the allocation failed with no memory
            --block_test->user_pages_allocation_retry_force_count;
            status = NV_ERR_NO_MEMORY;
        }
        else {
            // Try allocating a new one without eviction
            status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
        }

        if (status == NV_ERR_NO_MEMORY) {
            // If that fails with no memory, try allocating with eviction and
            // return to the caller immediately so that the operation can
            // be restarted.
            uvm_mutex_unlock(&block->lock);

            status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
            if (status == NV_OK) {
                block_retry_add_free_chunk(retry, gpu_chunk);
                status = NV_ERR_MORE_PROCESSING_REQUIRED;
            }

            uvm_mutex_lock(&block->lock);
            return status;
        }
        else if (status != NV_OK) {
            return status;
        }
    }

    *out_gpu_chunk = gpu_chunk;
    return NV_OK;
}
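
// Note on the retry flow above (illustrative): when block_alloc_gpu_chunk()
// returns NV_ERR_MORE_PROCESSING_REQUIRED, any chunk allocated with eviction
// has already been stashed on retry->free_chunks. The caller restarts the
// whole operation under the re-acquired block lock, and the retry path then
// reclaims that chunk via block_retry_get_free_chunk() instead of allocating
// again.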
1571 if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block)) 1572 return false; 1573 1574 UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M); 1575 1576 gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 1577 return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M); 1578 } 1579 1580 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu) 1581 { 1582 uvm_gpu_va_space_t *gpu_va_space; 1583 1584 gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 1585 return gpu_va_space->page_tables.big_page_size; 1586 } 1587 1588 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size) 1589 { 1590 NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size); 1591 NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size); 1592 1593 // The range must fit within a VA block 1594 UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end)); 1595 1596 if (outer_addr <= first_addr) 1597 return uvm_va_block_region(0, 0); 1598 1599 return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE); 1600 } 1601 1602 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size) 1603 { 1604 uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size); 1605 return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size); 1606 } 1607 1608 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size) 1609 { 1610 return range_big_page_region_all(va_block->start, va_block->end, big_page_size); 1611 } 1612 1613 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block, 1614 uvm_va_block_region_t region, 1615 NvU32 big_page_size) 1616 { 1617 NvU64 start = uvm_va_block_region_start(va_block, region); 1618 NvU64 end = uvm_va_block_region_end(va_block, region); 1619 uvm_va_block_region_t big_region; 1620 1621 UVM_ASSERT(start < va_block->end); 1622 UVM_ASSERT(end <= va_block->end); 1623 1624 big_region = range_big_page_region_all(start, end, big_page_size); 1625 if (big_region.outer) { 1626 big_region.first += region.first; 1627 big_region.outer += region.first; 1628 } 1629 1630 return big_region; 1631 } 1632 1633 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size) 1634 { 1635 return range_num_big_pages(va_block->start, va_block->end, big_page_size); 1636 } 1637 1638 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size) 1639 { 1640 NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size); 1641 UVM_ASSERT(addr >= va_block->start); 1642 UVM_ASSERT(addr < va_block->end); 1643 return addr; 1644 } 1645 1646 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size) 1647 { 1648 NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size); 1649 1650 // Assume that we don't have to handle multiple big PTEs per system page. 1651 // It's not terribly difficult to implement, but we don't currently have a 1652 // use case. 1653 UVM_ASSERT(big_page_size >= PAGE_SIZE); 1654 1655 return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size); 1656 } 1657 1658 // Returns the big page index (the bit index within 1659 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. 
If 1660 // page_index cannot be covered by a big PTE due to alignment or block size, 1661 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned. 1662 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size) 1663 { 1664 uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size); 1665 size_t big_index; 1666 1667 // Note that this condition also handles the case of having no big pages in 1668 // the block, in which case .first >= .outer. 1669 if (page_index < big_region_all.first || page_index >= big_region_all.outer) 1670 return MAX_BIG_PAGES_PER_UVM_VA_BLOCK; 1671 1672 big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size); 1673 1674 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start); 1675 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1); 1676 1677 return big_index; 1678 } 1679 1680 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block, 1681 uvm_gpu_t *gpu, 1682 uvm_page_mask_t *mask_out, 1683 const unsigned long *big_ptes_in) 1684 { 1685 uvm_va_block_region_t big_region; 1686 size_t big_page_index; 1687 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 1688 1689 uvm_page_mask_zero(mask_out); 1690 1691 for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 1692 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 1693 uvm_page_mask_region_fill(mask_out, big_region); 1694 } 1695 } 1696 1697 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index) 1698 { 1699 if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) 1700 return 0; 1701 1702 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU)); 1703 1704 // Despite the fact that physical CPU memory can be allocated at sizes 1705 // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory 1706 // with 4K PTEs. Until the core kernel adds support for PMD mappings, 1707 // the return value of this function will remain at PAGE_SIZE. 1708 return PAGE_SIZE; 1709 } 1710 1711 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index) 1712 { 1713 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1714 size_t big_page_size, big_page_index; 1715 1716 if (!gpu_state) 1717 return 0; 1718 1719 if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) 1720 return 0; 1721 1722 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id)); 1723 1724 if (gpu_state->pte_is_2m) 1725 return UVM_PAGE_SIZE_2M; 1726 1727 big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id)); 1728 big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size); 1729 if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes)) 1730 return big_page_size; 1731 1732 return UVM_PAGE_SIZE_4K; 1733 } 1734 1735 // Get the size of the physical allocation backing the page, or 0 if not 1736 // resident. Note that this is different from uvm_va_block_page_size_* because 1737 // those return the size of the PTE which maps the page index, which may be 1738 // smaller than the physical allocation. 
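// For example (illustrative): a page backed by a single 2MB CPU chunk reports
// the 2MB chunk size here, while uvm_va_block_page_size_cpu() reports
// PAGE_SIZE for the same page, since CPU mappings are established with
// PAGE_SIZE PTEs.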
1739 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page) 1740 { 1741 uvm_va_block_gpu_state_t *gpu_state; 1742 uvm_chunk_size_t chunk_size; 1743 1744 if (UVM_ID_IS_CPU(page.processor)) { 1745 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index); 1746 1747 if (!uvm_page_mask_test(&block->cpu.resident, page.page_index)) 1748 return 0; 1749 1750 UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU)); 1751 return (NvU32)uvm_cpu_chunk_get_size(chunk); 1752 } 1753 1754 gpu_state = uvm_va_block_gpu_state_get(block, page.processor); 1755 if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index)) 1756 return 0; 1757 1758 UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor)); 1759 block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size); 1760 return (NvU32)chunk_size; 1761 } 1762 1763 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot) 1764 { 1765 uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX; 1766 1767 // ATOMIC and WRITE are synonyms for the CPU 1768 if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE) 1769 pte_bit_index = UVM_PTE_BITS_CPU_WRITE; 1770 else if (prot == UVM_PROT_READ_ONLY) 1771 pte_bit_index = UVM_PTE_BITS_CPU_READ; 1772 else 1773 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); 1774 1775 return pte_bit_index; 1776 } 1777 1778 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot) 1779 { 1780 uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX; 1781 1782 if (prot == UVM_PROT_READ_WRITE_ATOMIC) 1783 pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC; 1784 else if (prot == UVM_PROT_READ_WRITE) 1785 pte_bit_index = UVM_PTE_BITS_GPU_WRITE; 1786 else if (prot == UVM_PROT_READ_ONLY) 1787 pte_bit_index = UVM_PTE_BITS_GPU_READ; 1788 else 1789 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); 1790 1791 return pte_bit_index; 1792 } 1793 1794 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) 1795 { 1796 uvm_va_block_gpu_state_t *gpu_state; 1797 1798 if (UVM_ID_IS_CPU(processor)) 1799 return &block->cpu.resident; 1800 1801 gpu_state = uvm_va_block_gpu_state_get(block, processor); 1802 1803 UVM_ASSERT(gpu_state); 1804 return &gpu_state->resident; 1805 } 1806 1807 // Get the page residency mask for a processor 1808 // 1809 // Notably this will allocate GPU state if not yet present and if that fails 1810 // NULL is returned. 
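// Unlike uvm_va_block_resident_mask_get(), which requires a GPU's state to
// already exist, this variant may be called before any state has been
// allocated, at the cost of possibly returning NULL under memory pressure.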
1811 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor) 1812 { 1813 uvm_va_block_gpu_state_t *gpu_state; 1814 1815 if (UVM_ID_IS_CPU(processor)) 1816 return &block->cpu.resident; 1817 1818 gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor)); 1819 if (!gpu_state) 1820 return NULL; 1821 1822 return &gpu_state->resident; 1823 } 1824 1825 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block, 1826 uvm_processor_id_t processor, 1827 uvm_prot_t prot) 1828 { 1829 uvm_va_block_gpu_state_t *gpu_state; 1830 1831 if (UVM_ID_IS_CPU(processor)) 1832 return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)]; 1833 1834 gpu_state = uvm_va_block_gpu_state_get(block, processor); 1835 1836 UVM_ASSERT(gpu_state); 1837 return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)]; 1838 } 1839 1840 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) 1841 { 1842 return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY); 1843 } 1844 1845 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) 1846 { 1847 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); 1848 UVM_ASSERT(gpu_state); 1849 1850 return &gpu_state->evicted; 1851 } 1852 1853 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index) 1854 { 1855 uvm_processor_id_t id; 1856 for_each_id_in_mask(id, &block->resident) { 1857 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index)) 1858 return true; 1859 } 1860 1861 return false; 1862 } 1863 1864 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) 1865 { 1866 uvm_va_block_gpu_state_t *gpu_state; 1867 size_t chunk_index; 1868 1869 if (UVM_ID_IS_CPU(proc)) 1870 return uvm_page_mask_test(&block->cpu.allocated, page_index); 1871 1872 gpu_state = uvm_va_block_gpu_state_get(block, proc); 1873 if (!gpu_state) 1874 return false; 1875 1876 chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL); 1877 return gpu_state->chunks[chunk_index] != NULL; 1878 } 1879 1880 static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) 1881 { 1882 const uvm_page_mask_t *resident_mask; 1883 1884 if (UVM_ID_IS_CPU(proc)) { 1885 resident_mask = &block->cpu.resident; 1886 } 1887 else { 1888 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc); 1889 if (!gpu_state) 1890 return false; 1891 1892 resident_mask = &gpu_state->resident; 1893 } 1894 1895 return uvm_page_mask_test(resident_mask, page_index); 1896 } 1897 1898 // Compute the gpus that have at least the given access permissions for the 1899 // range described by region and page_mask. The function sets the bit if any 1900 // page in the region has the permissions. 
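// Illustrative use (hypothetical caller, not a verbatim call site):
//
//     uvm_processor_mask_t atomic_gpus;
//     block_region_authorized_gpus(va_block, region, UVM_PROT_READ_WRITE_ATOMIC, &atomic_gpus);
//     // atomic_gpus now holds every GPU with an atomic-capable PTE on at
//     // least one page of the region.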
1901 static void block_region_authorized_gpus(uvm_va_block_t *va_block, 1902 uvm_va_block_region_t region, 1903 uvm_prot_t access_permission, 1904 uvm_processor_mask_t *authorized_gpus) 1905 { 1906 uvm_gpu_id_t gpu_id; 1907 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission); 1908 1909 uvm_processor_mask_zero(authorized_gpus); 1910 1911 // Test all GPUs with mappings on the block 1912 for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) { 1913 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1914 if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region)) 1915 uvm_processor_mask_set(authorized_gpus, gpu_id); 1916 } 1917 } 1918 1919 // Compute the processors that have at least the given access permissions for 1920 // the range described by region and page_mask. The function sets the bit if any 1921 // page in the region has the permissions. 1922 static void block_region_authorized_processors(uvm_va_block_t *va_block, 1923 uvm_va_block_region_t region, 1924 uvm_prot_t access_permission, 1925 uvm_processor_mask_t *authorized_processors) 1926 { 1927 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission); 1928 1929 // Compute GPUs 1930 block_region_authorized_gpus(va_block, region, access_permission, authorized_processors); 1931 1932 // Test CPU 1933 if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) && 1934 !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) { 1935 uvm_processor_mask_set(authorized_processors, UVM_ID_CPU); 1936 } 1937 } 1938 1939 static void block_page_authorized_processors(uvm_va_block_t *va_block, 1940 uvm_page_index_t page_index, 1941 uvm_prot_t access_permission, 1942 uvm_processor_mask_t *authorized_processors) 1943 { 1944 block_region_authorized_processors(va_block, 1945 uvm_va_block_region_for_page(page_index), 1946 access_permission, 1947 authorized_processors); 1948 } 1949 1950 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block, 1951 uvm_va_block_region_t region, 1952 uvm_gpu_id_t gpu_id, 1953 uvm_prot_t required_prot) 1954 { 1955 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot); 1956 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1957 1958 if (!gpu_state) 1959 return false; 1960 1961 return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region); 1962 } 1963 1964 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block, 1965 uvm_va_block_region_t region, 1966 uvm_processor_id_t processor_id, 1967 uvm_prot_t required_prot) 1968 { 1969 if (UVM_ID_IS_CPU(processor_id)) { 1970 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot); 1971 1972 return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region); 1973 } 1974 else { 1975 return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot); 1976 } 1977 } 1978 1979 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block, 1980 uvm_page_index_t page_index, 1981 uvm_gpu_id_t gpu_id, 1982 uvm_prot_t required_prot) 1983 { 1984 return block_is_gpu_authorized_on_whole_region(va_block, 1985 uvm_va_block_region_for_page(page_index), 1986 gpu_id, 1987 required_prot); 1988 } 1989 1990 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block, 1991 uvm_page_index_t page_index, 1992 uvm_processor_id_t processor_id, 1993 uvm_prot_t required_prot) 1994 { 1995 return 
block_is_processor_authorized_on_whole_region(va_block, 1996 uvm_va_block_region_for_page(page_index), 1997 processor_id, 1998 required_prot); 1999 } 2000 2001 // Compute the gpus that have a copy of the given page resident in their memory 2002 static void block_page_resident_gpus(uvm_va_block_t *va_block, 2003 uvm_page_index_t page_index, 2004 uvm_processor_mask_t *resident_gpus) 2005 { 2006 uvm_gpu_id_t id; 2007 uvm_processor_mask_zero(resident_gpus); 2008 2009 for_each_gpu_id_in_mask(id, &va_block->resident) { 2010 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) { 2011 UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index)); 2012 uvm_processor_mask_set(resident_gpus, id); 2013 } 2014 } 2015 } 2016 2017 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block, 2018 uvm_page_index_t page_index, 2019 uvm_processor_mask_t *resident_processors) 2020 { 2021 block_page_resident_gpus(va_block, page_index, resident_processors); 2022 2023 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) { 2024 UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index)); 2025 uvm_processor_mask_set(resident_processors, UVM_ID_CPU); 2026 } 2027 } 2028 2029 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index) 2030 { 2031 uvm_processor_mask_t resident_processors; 2032 uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); 2033 2034 return uvm_processor_mask_get_count(&resident_processors); 2035 } 2036 2037 static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block, 2038 uvm_page_index_t page_index, 2039 uvm_processor_id_t processor, 2040 const uvm_processor_mask_t *processor_mask) 2041 { 2042 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 2043 uvm_processor_mask_t search_mask; 2044 uvm_processor_id_t id; 2045 2046 if (processor_mask) 2047 uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident); 2048 else 2049 uvm_processor_mask_copy(&search_mask, &va_block->resident); 2050 2051 for_each_closest_id(id, &search_mask, processor, va_space) { 2052 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) 2053 return id; 2054 } 2055 2056 return UVM_ID_INVALID; 2057 } 2058 2059 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block, 2060 uvm_page_index_t page_index, 2061 uvm_processor_id_t processor) 2062 { 2063 return block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL); 2064 } 2065 2066 // We don't track the specific aperture of each mapped page. Instead, we assume 2067 // that each virtual mapping from a given processor always targets the closest 2068 // processor on which that page is resident (with special rules for UVM-Lite). 2069 // 2070 // This function verifies that assumption: before a page becomes resident on a 2071 // new location, assert that no processor has a valid mapping to a farther 2072 // processor on that page. 
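// For example (illustrative): if the CPU maps a page whose only resident copy
// is on a GPU, making that page additionally resident on the CPU while the CPU
// mapping still points at the GPU copy would violate the assumption (the CPU
// would then be mapping a farther copy than the closest one), and the assert
// below would fire.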
2073 static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency) 2074 { 2075 uvm_processor_mask_t resident_procs, mapped_procs; 2076 uvm_processor_id_t mapped_id, closest_id; 2077 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 2078 2079 uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block)); 2080 2081 for_each_id_in_mask(mapped_id, &mapped_procs) { 2082 if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) 2083 continue; 2084 2085 uvm_va_block_page_resident_processors(block, page_index, &resident_procs); 2086 UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs)); 2087 UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency)); 2088 uvm_processor_mask_set(&resident_procs, new_residency); 2089 closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id); 2090 UVM_ASSERT(!uvm_id_equal(closest_id, new_residency)); 2091 } 2092 2093 return true; 2094 } 2095 2096 // Returns the processor to which page_index should be mapped on gpu 2097 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block, 2098 uvm_gpu_t *gpu, 2099 uvm_page_index_t page_index) 2100 { 2101 uvm_processor_id_t dest_id; 2102 2103 // UVM-Lite GPUs can only map pages on the preferred location 2104 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) 2105 return uvm_va_range_get_policy(block->va_range)->preferred_location; 2106 2107 // Otherwise we always map the closest resident processor 2108 dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id); 2109 UVM_ASSERT(UVM_ID_IS_VALID(dest_id)); 2110 return dest_id; 2111 } 2112 2113 // Returns the processor to which page_index should be mapped on mapping_id 2114 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block, 2115 uvm_processor_id_t mapping_id, 2116 uvm_page_index_t page_index) 2117 { 2118 2119 if (UVM_ID_IS_CPU(mapping_id)) 2120 return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id); 2121 2122 return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index); 2123 } 2124 2125 static void block_get_mapped_processors(uvm_va_block_t *block, 2126 uvm_processor_id_t resident_id, 2127 uvm_page_index_t page_index, 2128 uvm_processor_mask_t *mapped_procs) 2129 { 2130 uvm_processor_id_t mapped_id; 2131 2132 uvm_processor_mask_zero(mapped_procs); 2133 2134 for_each_id_in_mask(mapped_id, &block->mapped) { 2135 if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) { 2136 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); 2137 2138 if (uvm_id_equal(to_map_id, resident_id)) 2139 uvm_processor_mask_set(mapped_procs, mapped_id); 2140 } 2141 } 2142 } 2143 2144 // We use block_gpu_get_processor_to_map to find the destination processor of a 2145 // given GPU mapping. This function is called when the mapping is established to 2146 // sanity check that the destination of the mapping matches the query. 
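// Like the other block_check_* helpers, this is a consistency check intended
// to be wrapped in UVM_ASSERT(): it always returns true and reports any
// mismatch through the UVM_ASSERT_MSG() in its body.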
2147 static bool block_check_mapping_residency_region(uvm_va_block_t *block, 2148 uvm_gpu_t *gpu, 2149 uvm_processor_id_t mapping_dest, 2150 uvm_va_block_region_t region, 2151 const uvm_page_mask_t *page_mask) 2152 { 2153 uvm_page_index_t page_index; 2154 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2155 NvU64 va = uvm_va_block_cpu_page_address(block, page_index); 2156 uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index); 2157 UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map), 2158 "VA 0x%llx on %s: mapping %s, supposed to map %s", 2159 va, 2160 uvm_gpu_name(gpu), 2161 block_processor_name(block, mapping_dest), 2162 block_processor_name(block, proc_to_map)); 2163 } 2164 return true; 2165 } 2166 2167 static bool block_check_mapping_residency(uvm_va_block_t *block, 2168 uvm_gpu_t *gpu, 2169 uvm_processor_id_t mapping_dest, 2170 const uvm_page_mask_t *page_mask) 2171 { 2172 return block_check_mapping_residency_region(block, 2173 gpu, 2174 mapping_dest, 2175 uvm_va_block_region_from_block(block), 2176 page_mask); 2177 } 2178 2179 // Check that there are no mappings targeting resident_id from any processor in 2180 // the block. 2181 static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id) 2182 { 2183 uvm_processor_id_t mapped_id; 2184 uvm_page_index_t page_index; 2185 2186 for_each_id_in_mask(mapped_id, &block->mapped) { 2187 const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id); 2188 2189 for_each_va_block_page_in_mask(page_index, map_mask, block) { 2190 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); 2191 UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id)); 2192 } 2193 } 2194 2195 return true; 2196 } 2197 2198 // Zero all pages of the newly-populated chunk which are not resident anywhere 2199 // else in the system, adding that work to the block's tracker. In all cases, 2200 // this function adds a dependency on passed in tracker to the block's tracker. 2201 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block, 2202 uvm_gpu_t *gpu, 2203 uvm_gpu_chunk_t *chunk, 2204 uvm_va_block_region_t chunk_region, 2205 uvm_tracker_t *tracker) 2206 { 2207 uvm_va_block_gpu_state_t *gpu_state; 2208 NV_STATUS status; 2209 uvm_gpu_address_t memset_addr_base, memset_addr; 2210 uvm_push_t push; 2211 uvm_gpu_id_t id; 2212 uvm_va_block_region_t subregion; 2213 uvm_page_mask_t *zero_mask; 2214 2215 UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk)); 2216 2217 if (chunk->is_zero) 2218 return NV_OK; 2219 2220 gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 2221 zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS); 2222 2223 if (!zero_mask) 2224 return NV_ERR_NO_MEMORY; 2225 2226 // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the 2227 // operation. 2228 // 2229 // We may over-zero the page with this approach. For example, we might be 2230 // populating a 2MB chunk because only a single page within that chunk needs 2231 // to be made resident. If we also zero non-resident pages outside of the 2232 // strict region, we could waste the effort if those pages are populated on 2233 // another processor later and migrated here. 2234 // 2235 // We zero all non-resident pages in the chunk anyway for two reasons: 2236 // 2237 // 1) Efficiency. It's better to do all zeros as pipelined transfers once 2238 // rather than scatter them around for each populate operation. 
2239 // 2240 // 2) Optimizing the common case of block_populate_gpu_chunk being called 2241 // for already-populated chunks. If we zero once at initial populate, we 2242 // can simply check whether the chunk is present in the array. Otherwise 2243 // we'd have to recompute the "is any page resident" mask every time. 2244 2245 // Roll up all pages in chunk_region which are resident somewhere 2246 uvm_page_mask_zero(zero_mask); 2247 for_each_id_in_mask(id, &block->resident) 2248 uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id)); 2249 2250 // If all pages in the chunk are resident somewhere, we don't need to clear 2251 // anything. Just make sure the chunk is tracked properly. 2252 if (uvm_page_mask_region_full(zero_mask, chunk_region)) { 2253 status = uvm_tracker_add_tracker_safe(&block->tracker, tracker); 2254 goto out; 2255 } 2256 2257 // Complement to get the pages which are not resident anywhere. These 2258 // are the pages which must be zeroed. 2259 uvm_page_mask_complement(zero_mask, zero_mask); 2260 2261 memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address)); 2262 memset_addr = memset_addr_base; 2263 2264 status = uvm_push_begin_acquire(gpu->channel_manager, 2265 UVM_CHANNEL_TYPE_GPU_INTERNAL, 2266 tracker, 2267 &push, 2268 "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)", 2269 chunk->address, 2270 chunk->address + uvm_gpu_chunk_get_size(chunk), 2271 uvm_va_block_region_start(block, chunk_region), 2272 uvm_va_block_region_end(block, chunk_region) + 1, 2273 block->start, 2274 block->end + 1); 2275 if (status != NV_OK) 2276 goto out; 2277 2278 for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) { 2279 // Pipeline the memsets since they never overlap with each other 2280 uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); 2281 2282 // We'll push one membar later for all memsets in this loop 2283 uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); 2284 2285 memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE; 2286 gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion)); 2287 } 2288 2289 // A membar from this GPU is required between this memset and any PTE write 2290 // pointing this or another GPU to this chunk. Otherwise an engine could 2291 // read the PTE then access the page before the memset write is visible to 2292 // that engine. 2293 // 2294 // This memset writes GPU memory, so local mappings need only a GPU-local 2295 // membar. We can't easily determine here whether a peer GPU will ever map 2296 // this page in the future, so always use a sysmembar. uvm_push_end provides 2297 // one by default. 2298 // 2299 // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map 2300 // this page. When peer access gets enabled, do a MEMBAR_SYS at that 2301 // point. 
2302 uvm_push_end(&push); 2303 status = uvm_tracker_add_push_safe(&block->tracker, &push); 2304 2305 out: 2306 if (zero_mask) 2307 kmem_cache_free(g_uvm_page_mask_cache, zero_mask); 2308 2309 return status; 2310 } 2311 2312 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block, 2313 uvm_va_block_retry_t *retry, 2314 uvm_gpu_t *gpu, 2315 size_t chunk_index, 2316 uvm_va_block_region_t chunk_region) 2317 { 2318 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); 2319 uvm_gpu_chunk_t *chunk = NULL; 2320 uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region); 2321 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); 2322 NV_STATUS status; 2323 2324 if (!gpu_state) 2325 return NV_ERR_NO_MEMORY; 2326 2327 uvm_assert_mutex_locked(&block->lock); 2328 UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu)); 2329 UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes); 2330 2331 // We zero chunks as necessary at initial population, so if the chunk is 2332 // already populated we're done. See the comment in 2333 // block_zero_new_gpu_chunk. 2334 if (gpu_state->chunks[chunk_index]) 2335 return NV_OK; 2336 2337 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region)); 2338 2339 status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk); 2340 if (status != NV_OK) 2341 return status; 2342 2343 // In some configurations such as SR-IOV heavy, the chunk cannot be 2344 // referenced using its physical address. Create a virtual mapping. 2345 status = uvm_mmu_chunk_map(chunk); 2346 if (status != NV_OK) 2347 goto chunk_free; 2348 2349 status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker); 2350 if (status != NV_OK) 2351 goto chunk_unmap; 2352 2353 // It is safe to modify the page index field without holding any PMM locks 2354 // because the chunk is pinned, which means that none of the other fields in 2355 // the bitmap can change. 2356 chunk->va_block_page_index = chunk_region.first; 2357 2358 // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at 2359 // compile-time that it can store VA Block page indexes. 2360 BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE); 2361 2362 status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk); 2363 if (status != NV_OK) 2364 goto chunk_unmap; 2365 2366 if (block_test && block_test->inject_populate_error) { 2367 block_test->inject_populate_error = false; 2368 2369 // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than 2370 // causing a fatal OOM failure. 2371 status = NV_ERR_MORE_PROCESSING_REQUIRED; 2372 goto chunk_unmap_indirect_peers; 2373 } 2374 2375 // Record the used chunk so that it can be unpinned at the end of the whole 2376 // operation. 2377 block_retry_add_used_chunk(retry, chunk); 2378 gpu_state->chunks[chunk_index] = chunk; 2379 2380 return NV_OK; 2381 2382 chunk_unmap_indirect_peers: 2383 block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk); 2384 2385 chunk_unmap: 2386 uvm_mmu_chunk_unmap(chunk, &block->tracker); 2387 2388 chunk_free: 2389 // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it 2390 // placed in the block tracker. 2391 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); 2392 2393 return status; 2394 } 2395 2396 // Populate all chunks which cover the given region and page mask. 
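// The loop below walks the region one GPU chunk at a time: each chunk's page
// range is clamped to the requested region, chunks with no pages set in
// populate_mask are skipped, and any chunk with at least one requested page is
// populated in its entirety (allocation granularity is the chunk, not the
// page).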
2397 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block, 2398 uvm_va_block_retry_t *retry, 2399 uvm_gpu_t *gpu, 2400 uvm_va_block_region_t region, 2401 const uvm_page_mask_t *populate_mask) 2402 { 2403 uvm_va_block_region_t chunk_region, check_region; 2404 size_t chunk_index; 2405 uvm_page_index_t page_index; 2406 uvm_chunk_size_t chunk_size; 2407 NV_STATUS status; 2408 2409 page_index = uvm_va_block_first_page_in_mask(region, populate_mask); 2410 if (page_index == region.outer) 2411 return NV_OK; 2412 2413 chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); 2414 chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index); 2415 2416 while (1) { 2417 check_region = uvm_va_block_region(max(chunk_region.first, region.first), 2418 min(chunk_region.outer, region.outer)); 2419 page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask); 2420 if (page_index != check_region.outer) { 2421 status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region); 2422 if (status != NV_OK) 2423 return status; 2424 } 2425 2426 if (check_region.outer == region.outer) 2427 break; 2428 2429 ++chunk_index; 2430 chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer); 2431 chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE)); 2432 } 2433 2434 return NV_OK; 2435 } 2436 2437 static NV_STATUS block_populate_pages(uvm_va_block_t *block, 2438 uvm_va_block_retry_t *retry, 2439 uvm_va_block_context_t *block_context, 2440 uvm_processor_id_t dest_id, 2441 uvm_va_block_region_t region, 2442 const uvm_page_mask_t *page_mask) 2443 { 2444 NV_STATUS status; 2445 const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id); 2446 uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask; 2447 uvm_memcg_context_t memcg_context; 2448 2449 if (!resident_mask) 2450 return NV_ERR_NO_MEMORY; 2451 2452 if (page_mask) 2453 uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask); 2454 else 2455 uvm_page_mask_complement(populate_page_mask, resident_mask); 2456 2457 if (UVM_ID_IS_GPU(dest_id)) 2458 return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask); 2459 2460 uvm_memcg_context_start(&memcg_context, block_context->mm); 2461 status = block_populate_pages_cpu(block, populate_page_mask, region, block_context); 2462 uvm_memcg_context_end(&memcg_context); 2463 return status; 2464 } 2465 2466 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from) 2467 { 2468 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 2469 2470 return &va_space->can_copy_from[uvm_id_value(from)]; 2471 } 2472 2473 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to) 2474 { 2475 return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from); 2476 } 2477 2478 // Get the chunk containing the given page, along with the offset of that page 2479 // within the chunk. 
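// For example (illustrative, assuming a 4K PAGE_SIZE and a 2MB GPU chunk that
// starts at the beginning of the block): page_index 3 returns that chunk with
// *chunk_offset == 3 * 4096 == 0x3000.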
2480 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
2481 {
2482     uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
2483     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
2484     size_t chunk_index;
2485     uvm_gpu_chunk_t *chunk;
2486     uvm_chunk_size_t chunk_size;
2487
2488     UVM_ASSERT(gpu_state);
2489
2490     chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
2491     chunk = gpu_state->chunks[chunk_index];
2492     UVM_ASSERT(chunk);
2493
2494     if (chunk_offset) {
2495         size_t page_offset = block_page.page_index -
2496                              uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
2497         *chunk_offset = page_offset * PAGE_SIZE;
2498     }
2499
2500     return chunk;
2501 }
2502
2503 // Get the physical GPU address of a block's page from the POV of the specified GPU.
2504 // This is the address that should be used for making PTEs for the specified GPU.
2505 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
2506                                                       block_phys_page_t block_page,
2507                                                       uvm_gpu_t *gpu)
2508 {
2509     uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2510     size_t chunk_offset;
2511     uvm_gpu_chunk_t *chunk;
2512
2513     UVM_ASSERT(accessing_gpu_state);
2514
2515     if (UVM_ID_IS_CPU(block_page.processor)) {
2516         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
2517         NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
2518         uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
2519                                                                        uvm_cpu_chunk_get_size(chunk),
2520                                                                        block_page.page_index);
2521
2522         // The page should be mapped for physical access already as we do that
2523         // eagerly on CPU page population and GPU state alloc.
2524         UVM_ASSERT(dma_addr != 0);
2525         dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
2526
2527         return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
2528     }
2529
2530     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2531
2532     if (uvm_id_equal(block_page.processor, gpu->id)) {
2533         return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
2534     }
2535     else {
2536         uvm_gpu_phys_address_t phys_addr;
2537         uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
2538         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2539
2540         UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2541         phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
2542         phys_addr.address += chunk_offset;
2543         return phys_addr;
2544     }
2545 }
2546
2547 // Get the physical GPU address of a block's page from the POV of the specified
2548 // GPU, suitable for accessing the memory from UVM-internal CE channels.
2549 //
2550 // Notably this may be different from block_phys_page_address() to handle CE
2551 // limitations in addressing physical memory directly.
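// For peer memory in particular, the address produced here comes from
// uvm_pmm_gpu_peer_copy_address() rather than the raw peer physical address;
// see the note on peer_identity_mappings_supported in the function body.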
2552 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block, 2553 block_phys_page_t block_page, 2554 uvm_gpu_t *gpu) 2555 { 2556 uvm_gpu_t *owning_gpu; 2557 size_t chunk_offset; 2558 uvm_gpu_chunk_t *chunk; 2559 uvm_gpu_address_t copy_addr; 2560 uvm_va_space_t *va_space; 2561 2562 UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor), 2563 "from %s to %s\n", 2564 block_processor_name(block, gpu->id), 2565 block_processor_name(block, block_page.processor)); 2566 2567 // CPU and local GPU accesses can rely on block_phys_page_address, but the 2568 // resulting physical address may need to be converted into virtual. 2569 if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id)) 2570 return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu)); 2571 2572 va_space = uvm_va_block_get_va_space(block); 2573 2574 // See the comments on the peer_identity_mappings_supported assignments in 2575 // the HAL for why we disable direct copies between peers. 2576 owning_gpu = block_get_gpu(block, block_page.processor); 2577 2578 UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu)); 2579 2580 chunk = block_phys_page_chunk(block, block_page, &chunk_offset); 2581 copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu); 2582 copy_addr.address += chunk_offset; 2583 return copy_addr; 2584 } 2585 2586 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block, 2587 uvm_page_index_t page_index, 2588 uvm_processor_id_t residency, 2589 uvm_gpu_t *gpu) 2590 { 2591 uvm_assert_mutex_locked(&va_block->lock); 2592 2593 return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu); 2594 } 2595 2596 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, 2597 uvm_page_index_t page_index, 2598 uvm_gpu_t *gpu) 2599 { 2600 return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu); 2601 } 2602 2603 typedef struct 2604 { 2605 // Location of the memory 2606 uvm_processor_id_t id; 2607 2608 // Whether the whole block has a single physically-contiguous chunk of 2609 // storage on the processor. 2610 bool is_block_contig; 2611 2612 // Starting address of the physically-contiguous allocation, from the view 2613 // of the copying GPU. Valid only if is_block_contig. 2614 uvm_gpu_address_t gpu_address; 2615 } block_copy_addr_t; 2616 2617 typedef struct 2618 { 2619 block_copy_addr_t src; 2620 block_copy_addr_t dst; 2621 uvm_conf_computing_dma_buffer_t *dma_buffer; 2622 } block_copy_state_t; 2623 2624 // Begin a push appropriate for copying data from src_id processor to dst_id processor. 2625 // One of src_id and dst_id needs to be a GPU. 
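// The push is begun on the GPU that performs the copy: the destination GPU for
// CPU->GPU transfers, the source GPU for GPU->CPU transfers, and the source
// GPU for GPU->GPU transfers (pushing from the source behaves better for P2P
// over PCIe, per the comment in the body).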
2626 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block, 2627 block_copy_state_t *copy_state, 2628 uvm_tracker_t *tracker, 2629 uvm_push_t *push) 2630 { 2631 uvm_gpu_t *gpu; 2632 NV_STATUS status; 2633 uvm_channel_type_t channel_type; 2634 uvm_tracker_t *tracker_ptr = tracker; 2635 uvm_processor_id_t dst_id = copy_state->dst.id; 2636 uvm_processor_id_t src_id = copy_state->src.id; 2637 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 2638 2639 UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id), 2640 "Unexpected copy to self, processor %s\n", 2641 block_processor_name(va_block, src_id)); 2642 2643 if (UVM_ID_IS_CPU(src_id)) { 2644 gpu = block_get_gpu(va_block, dst_id); 2645 channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU; 2646 } 2647 else if (UVM_ID_IS_CPU(dst_id)) { 2648 gpu = block_get_gpu(va_block, src_id); 2649 channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU; 2650 } 2651 else { 2652 // For GPU to GPU copies, prefer to "push" the data from the source as 2653 // that works better at least for P2P over PCI-E. 2654 gpu = block_get_gpu(va_block, src_id); 2655 2656 channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU; 2657 } 2658 2659 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id), 2660 "GPU %s dst %s src %s\n", 2661 block_processor_name(va_block, gpu->id), 2662 block_processor_name(va_block, dst_id), 2663 block_processor_name(va_block, src_id)); 2664 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id), 2665 "GPU %s dst %s src %s\n", 2666 block_processor_name(va_block, gpu->id), 2667 block_processor_name(va_block, dst_id), 2668 block_processor_name(va_block, src_id)); 2669 2670 if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) { 2671 uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id); 2672 return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager, 2673 dst_gpu, 2674 tracker, 2675 push, 2676 "Copy from %s to %s for block [0x%llx, 0x%llx]", 2677 block_processor_name(va_block, src_id), 2678 block_processor_name(va_block, dst_id), 2679 va_block->start, 2680 va_block->end); 2681 } 2682 2683 if (uvm_conf_computing_mode_enabled(gpu)) { 2684 // When the Confidential Feature is enabled, additional dependencies 2685 // apply to the input tracker as well as the dma_buffer tracker. 2686 // * In the CPU to GPU case, because UVM performs CPU side 2687 // crypto-operations first before the GPU copy, we both need to 2688 // ensure that the dma_buffer and the input tracker are completed. 2689 // * In the GPU to CPU case, the GPU copy happens first, but the same 2690 // principles apply. Hence, UVM acquires the input tracker and the 2691 // dma buffer. 
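        // In other words, local_tracker ends up covering both the caller's
        // tracker and the dma_buffer's outstanding work; for the CPU to GPU
        // case it is fully waited on below, since the CPU-side encryption
        // happens before the GPU copy (see the comment above).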
2692 status = uvm_tracker_overwrite_safe(&local_tracker, tracker); 2693 if (status != NV_OK) 2694 goto error; 2695 2696 UVM_ASSERT(copy_state->dma_buffer == NULL); 2697 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, 2698 ©_state->dma_buffer, 2699 &local_tracker); 2700 2701 if (status != NV_OK) 2702 goto error; 2703 2704 if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) { 2705 status = uvm_tracker_wait(&local_tracker); 2706 if (status != NV_OK) 2707 goto error; 2708 } 2709 2710 tracker_ptr = &local_tracker; 2711 } 2712 2713 status = uvm_push_begin_acquire(gpu->channel_manager, 2714 channel_type, 2715 tracker_ptr, 2716 push, 2717 "Copy from %s to %s for block [0x%llx, 0x%llx]", 2718 block_processor_name(va_block, src_id), 2719 block_processor_name(va_block, dst_id), 2720 va_block->start, 2721 va_block->end); 2722 2723 error: 2724 // Caller is responsible for freeing the DMA buffer on error 2725 uvm_tracker_deinit(&local_tracker); 2726 return status; 2727 } 2728 2729 // A page is clean iff... 2730 // the destination is the preferred location and 2731 // the source is the CPU and 2732 // the destination does not support faults/eviction and 2733 // the CPU page is not dirty 2734 static bool block_page_is_clean(uvm_va_block_t *block, 2735 uvm_processor_id_t dst_id, 2736 uvm_processor_id_t src_id, 2737 uvm_page_index_t page_index) 2738 { 2739 return !uvm_va_block_is_hmm(block) && 2740 uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) && 2741 UVM_ID_IS_CPU(src_id) && 2742 !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling && 2743 !block_cpu_page_is_dirty(block, page_index); 2744 } 2745 2746 // When the destination is the CPU... 2747 // if the source is the preferred location, mark as clean 2748 // otherwise, mark as dirty 2749 static void block_update_page_dirty_state(uvm_va_block_t *block, 2750 uvm_processor_id_t dst_id, 2751 uvm_processor_id_t src_id, 2752 uvm_page_index_t page_index) 2753 { 2754 if (UVM_ID_IS_GPU(dst_id)) 2755 return; 2756 2757 if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location)) 2758 block_mark_cpu_page_clean(block, page_index); 2759 else 2760 block_mark_cpu_page_dirty(block, page_index); 2761 } 2762 2763 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id) 2764 { 2765 uvm_gpu_t *gpu; 2766 2767 if (UVM_ID_IS_CPU(id)) 2768 return; 2769 2770 gpu = block_get_gpu(block, id); 2771 2772 // If the block is of the max size and the GPU supports eviction, mark the 2773 // root chunk as used in PMM. 2774 // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks. 
2775 if (!uvm_va_block_is_hmm(block) && 2776 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && 2777 uvm_gpu_supports_eviction(gpu)) { 2778 // The chunk has to be there if this GPU is resident 2779 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id)); 2780 uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]); 2781 } 2782 } 2783 2784 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) 2785 { 2786 UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); 2787 2788 if (uvm_processor_mask_test_and_set(&block->resident, id)) 2789 return; 2790 2791 block_mark_memory_used(block, id); 2792 } 2793 2794 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) 2795 { 2796 uvm_gpu_t *gpu; 2797 2798 UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); 2799 2800 if (!uvm_processor_mask_test_and_clear(&block->resident, id)) 2801 return; 2802 2803 if (UVM_ID_IS_CPU(id)) 2804 return; 2805 2806 gpu = block_get_gpu(block, id); 2807 2808 // If the block is of the max size and the GPU supports eviction, mark the 2809 // root chunk as unused in PMM. 2810 if (!uvm_va_block_is_hmm(block) && 2811 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && 2812 uvm_gpu_supports_eviction(gpu)) { 2813 // The chunk may not be there any more when residency is cleared. 2814 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 2815 if (gpu_state && gpu_state->chunks[0]) 2816 uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]); 2817 } 2818 } 2819 2820 static bool block_phys_copy_contig_check(uvm_va_block_t *block, 2821 uvm_page_index_t page_index, 2822 const uvm_gpu_address_t *base_address, 2823 uvm_processor_id_t proc_id, 2824 uvm_gpu_t *copying_gpu) 2825 { 2826 uvm_gpu_address_t page_address; 2827 uvm_gpu_address_t contig_address = *base_address; 2828 2829 contig_address.address += page_index * PAGE_SIZE; 2830 2831 page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu); 2832 2833 return uvm_gpu_addr_cmp(page_address, contig_address) == 0; 2834 } 2835 2836 // Check if the VA block has a single physically-contiguous chunk of storage 2837 // on the processor. 2838 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id) 2839 { 2840 uvm_cpu_chunk_t *chunk; 2841 2842 if (UVM_ID_IS_GPU(id)) 2843 return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0); 2844 2845 chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL); 2846 return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk)); 2847 } 2848 2849 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block, 2850 uvm_page_index_t page_index, 2851 uvm_processor_id_t resident_id) 2852 { 2853 if (UVM_ID_IS_CPU(resident_id)) { 2854 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 2855 return uvm_cpu_chunk_block_region(block, chunk, page_index); 2856 } 2857 else { 2858 uvm_chunk_size_t chunk_size; 2859 (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size); 2860 return uvm_va_block_chunk_region(block, chunk_size, page_index); 2861 } 2862 } 2863 2864 // Like block_phys_page_copy_address, but uses the address cached in bca when 2865 // possible. 
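// When the whole block is physically contiguous on bca->id, the per-page
// address is simply bca->gpu_address plus page_index * PAGE_SIZE; the
// UVM_ASSERT() in the body cross-checks that shortcut against
// block_phys_page_copy_address() (via block_phys_copy_contig_check()).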
2866 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block, 2867 block_copy_addr_t *bca, 2868 uvm_page_index_t page_index, 2869 uvm_gpu_t *copying_gpu) 2870 { 2871 if (bca->is_block_contig) { 2872 uvm_gpu_address_t addr = bca->gpu_address; 2873 addr.address += page_index * PAGE_SIZE; 2874 UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu)); 2875 return addr; 2876 } 2877 2878 return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu); 2879 } 2880 2881 // When the Confidential Computing feature is enabled, the function performs 2882 // CPU side page encryption and GPU side decryption to the CPR. 2883 // GPU operations respect the caller's membar previously set in the push. 2884 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block, 2885 block_copy_state_t *copy_state, 2886 uvm_va_block_region_t region, 2887 uvm_push_t *push) 2888 { 2889 uvm_push_flag_t membar_flag = 0; 2890 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 2891 uvm_page_index_t page_index = region.first; 2892 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer; 2893 struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index); 2894 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 2895 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 2896 char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) + 2897 (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE); 2898 uvm_gpu_address_t dst_address = block_copy_get_address(block, ©_state->dst, page_index, gpu); 2899 char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE); 2900 2901 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id)); 2902 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id)); 2903 2904 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 2905 2906 // See comment in block_copy_begin_push. 2907 UVM_ASSERT(uvm_tracker_is_completed(&block->tracker)); 2908 2909 staging_buffer.address += page_index * PAGE_SIZE; 2910 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2911 2912 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) 2913 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE; 2914 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) 2915 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU; 2916 2917 // kmap() only guarantees PAGE_SIZE contiguity, all encryption and 2918 // decryption must happen on a PAGE_SIZE basis. 2919 for_each_va_block_page_in_region(page_index, region) { 2920 void *src_cpu_virt_addr; 2921 2922 // The caller guarantees that all pages in region are contiguous, 2923 // meaning they're guaranteed to be part of the same compound page. 2924 UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index)); 2925 2926 src_cpu_virt_addr = kmap(src_page); 2927 uvm_conf_computing_cpu_encrypt(push->channel, 2928 cpu_va_staging_buffer, 2929 src_cpu_virt_addr, 2930 NULL, 2931 PAGE_SIZE, 2932 cpu_auth_tag_buffer); 2933 kunmap(src_page); 2934 2935 // First LCE operation should be non-pipelined to guarantee ordering as 2936 // we do not know when was the last non-pipelined copy. 2937 // Last one applies the membar originally planned for the push if any 2938 // TODO: 3857691: Inherit policy instead of forcing first invocation to 2939 // be non pipelined. 
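        // Illustrative flag pattern for a three-page region: the first decrypt
        // is not pipelined and suppresses its membar, the middle one is
        // pipelined with no membar, and the last one is pipelined and carries
        // whatever membar the caller originally set on the push (or the
        // default if none was set).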
2940 if (page_index > region.first) 2941 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); 2942 2943 if (page_index < (region.outer - 1)) 2944 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); 2945 else if (membar_flag) 2946 uvm_push_set_flag(push, membar_flag); 2947 2948 gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer); 2949 2950 src_page++; 2951 dst_address.address += PAGE_SIZE; 2952 cpu_va_staging_buffer += PAGE_SIZE; 2953 staging_buffer.address += PAGE_SIZE; 2954 cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2955 auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2956 } 2957 } 2958 2959 // When the Confidential Computing feature is enabled, the function performs 2960 // GPU side page encryption. GPU operations respect the caller's membar 2961 // previously set in the push. 2962 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block, 2963 block_copy_state_t *copy_state, 2964 uvm_va_block_region_t region, 2965 uvm_push_t *push) 2966 { 2967 uvm_push_flag_t membar_flag = 0; 2968 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 2969 uvm_page_index_t page_index = region.first; 2970 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer; 2971 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 2972 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 2973 uvm_gpu_address_t src_address = block_copy_get_address(block, ©_state->src, page_index, gpu); 2974 2975 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id)); 2976 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id)); 2977 2978 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 2979 2980 staging_buffer.address += page_index * PAGE_SIZE; 2981 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2982 2983 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) 2984 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE; 2985 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) 2986 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU; 2987 2988 // Because we use kmap() for mapping pages for CPU side 2989 // crypto-operations and it only guarantees PAGE_SIZE contiguity, all 2990 // encryptions and decryptions must happen on a PAGE_SIZE basis. 2991 for_each_va_block_page_in_region(page_index, region) { 2992 uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]); 2993 2994 // First LCE operation should be non-pipelined to guarantee ordering as 2995 // we do not know when was the last non-pipelined copy. 2996 // Last one applies the membar originally planned for the push if any 2997 // TODO: 3857691: Inherit policy instead of forcing first invocation to 2998 // be non pipelined. 
2999         if (page_index > region.first)
3000             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3001
3002         if (page_index < (region.outer - 1))
3003             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3004         else if (membar_flag)
3005             uvm_push_set_flag(push, membar_flag);
3006
3007         gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3008
3009         src_address.address += PAGE_SIZE;
3010         staging_buffer.address += PAGE_SIZE;
3011         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3012     }
3013
3014     uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3015 }
3016
3017 static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3018                                                   block_copy_state_t *copy_state,
3019                                                   uvm_push_t *push)
3020 {
3021     NV_STATUS status;
3022     uvm_page_index_t page_index;
3023     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3024     uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3025     void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3026     void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3027
3028     UVM_ASSERT(uvm_conf_computing_mode_enabled(push->gpu));
3029
3030     if (UVM_ID_IS_GPU(copy_state->dst.id))
3031         return NV_OK;
3032
3033     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3034
3035     status = uvm_push_wait(push);
3036     if (status != NV_OK)
3037         return status;
3038
3039     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
3040     // decryption must happen on a PAGE_SIZE basis.
3041     for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3042         struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
3043         void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3044         void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3045         void *cpu_page_address = kmap(dst_page);
3046
3047         status = uvm_conf_computing_cpu_decrypt(push->channel,
3048                                                 cpu_page_address,
3049                                                 staging_buffer,
3050                                                 &dma_buffer->decrypt_iv[page_index],
3051                                                 PAGE_SIZE,
3052                                                 auth_tag_buffer);
3053         kunmap(dst_page);
3054         if (status != NV_OK) {
3055             // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
3056             // failures & other failures gracefully.
3057             // uvm_conf_computing_cpu_decrypt() can fail if the authentication
3058             // tag verification fails. Should this happen, it is treated as a
3059             // critical failure from which there is no recovery.
3060 uvm_global_set_fatal_error(status);
3061 return status;
3062 }
3063 }
3064
3065 return NV_OK;
3066 }
3067
3068 static void block_copy_push(uvm_va_block_t *block,
3069 block_copy_state_t *copy_state,
3070 uvm_va_block_region_t region,
3071 uvm_push_t *push)
3072 {
3073 uvm_gpu_address_t gpu_dst_address;
3074 uvm_gpu_address_t gpu_src_address;
3075 uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3076
3077 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3078
3079 if (uvm_conf_computing_mode_enabled(gpu)) {
3080 if (UVM_ID_IS_CPU(copy_state->src.id))
3081 conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3082 else
3083 conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3084
3085 return;
3086 }
3087
3088 gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3089 gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3090 gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3091 }
3092
3093 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3094 block_copy_state_t *copy_state,
3095 uvm_tracker_t *copy_tracker,
3096 NV_STATUS push_status,
3097 uvm_push_t *push)
3098 {
3099 NV_STATUS tracker_status;
3100
3101 // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3102 // by that GPU, use a GPU-local membar if no peer can currently
3103 // map this page. When peer access gets enabled, do a MEMBAR_SYS
3104 // at that point.
3105 uvm_push_end(push);
3106
3107 if ((push_status == NV_OK) && uvm_conf_computing_mode_enabled(push->gpu))
3108 push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3109
3110 tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
3111 if (push_status == NV_OK)
3112 push_status = tracker_status;
3113
3114 if (uvm_conf_computing_mode_enabled(push->gpu)) {
3115 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3116
3117 uvm_tracker_overwrite_with_push(&local_tracker, push);
3118 uvm_conf_computing_dma_buffer_free(&push->gpu->conf_computing.dma_buffer_pool,
3119 copy_state->dma_buffer,
3120 &local_tracker);
3121 copy_state->dma_buffer = NULL;
3122 uvm_tracker_deinit(&local_tracker);
3123 }
3124
3125 return push_status;
3126 }
3127
3128 // Copies pages resident on the src_id processor to the dst_id processor
3129 //
3130 // The function adds the pages that were successfully copied to the output
3131 // migrated_pages mask and returns the number of pages in copied_pages. These
3132 // fields are reliable even if an error is returned.
3133 //
3134 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
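// Typical usage (see block_copy_resident_pages_mask() below): the caller
// builds copy_mask from a single source processor's resident mask, calls
// this function once per source, accumulates *copied_pages, and relies on
// migrated_pages to avoid copying the same page again from a later source.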
3135 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block, 3136 uvm_va_block_context_t *block_context, 3137 uvm_processor_id_t dst_id, 3138 uvm_processor_id_t src_id, 3139 uvm_va_block_region_t region, 3140 uvm_page_mask_t *copy_mask, 3141 const uvm_page_mask_t *prefetch_page_mask, 3142 uvm_va_block_transfer_mode_t transfer_mode, 3143 uvm_page_mask_t *migrated_pages, 3144 NvU32 *copied_pages, 3145 uvm_tracker_t *copy_tracker) 3146 { 3147 NV_STATUS status = NV_OK; 3148 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3149 uvm_gpu_t *copying_gpu = NULL; 3150 uvm_push_t push; 3151 uvm_page_index_t page_index; 3152 uvm_page_index_t contig_start_index = region.outer; 3153 uvm_page_index_t last_index = region.outer; 3154 uvm_range_group_range_t *rgr = NULL; 3155 bool rgr_has_changed = false; 3156 uvm_make_resident_cause_t cause = block_context->make_resident.cause; 3157 uvm_make_resident_cause_t contig_cause = cause; 3158 const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3159 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3160 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask; 3161 block_copy_state_t copy_state = {0}; 3162 uvm_va_range_t *va_range = block->va_range; 3163 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 3164 3165 copy_state.src.id = src_id; 3166 copy_state.dst.id = dst_id; 3167 copy_state.src.is_block_contig = is_block_phys_contig(block, src_id); 3168 copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id); 3169 3170 *copied_pages = 0; 3171 3172 // If there are no pages to be copied, exit early 3173 if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) || 3174 !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages)) 3175 return NV_OK; 3176 3177 // uvm_range_group_range_iter_first should only be called when the va_space 3178 // lock is held, which is always the case unless an eviction is taking 3179 // place. 3180 if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) { 3181 rgr = uvm_range_group_range_iter_first(va_space, 3182 uvm_va_block_region_start(block, region), 3183 uvm_va_block_region_end(block, region)); 3184 rgr_has_changed = true; 3185 } 3186 3187 if (UVM_ID_IS_CPU(dst_id)) { 3188 uvm_memcg_context_t memcg_context; 3189 3190 // To support staging through CPU, populate CPU pages on demand. 3191 // GPU destinations should have their pages populated already, but 3192 // that might change if we add staging through GPUs. 3193 uvm_memcg_context_start(&memcg_context, block_context->mm); 3194 status = block_populate_pages_cpu(block, copy_mask, region, block_context); 3195 uvm_memcg_context_end(&memcg_context); 3196 if (status != NV_OK) 3197 return status; 3198 } 3199 3200 // TODO: Bug 3745051: This function is complicated and needs refactoring 3201 for_each_va_block_page_in_region_mask(page_index, copy_mask, region) { 3202 NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index); 3203 uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ? 3204 UVM_MAKE_RESIDENT_CAUSE_PREFETCH: 3205 cause; 3206 3207 UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); 3208 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); 3209 3210 // If we're not evicting and we're migrating away from the preferred 3211 // location, then we should add the range group range to the list of 3212 // migrated ranges in the range group. 
It's safe to skip this because
3213 // the use of range_group's migrated_ranges list is a UVM-Lite
3214 // optimization - eviction is not supported on UVM-Lite GPUs.
3215 if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
3216 uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
3217 // rgr_has_changed is used to minimize the number of times the
3218 // migrated_ranges_lock is taken. It is set to false when the range
3219 // group range pointed by rgr is added to the migrated_ranges list,
3220 // and it is just set back to true when we move to a different
3221 // range group range.
3222
3223 // The current page could be after the end of rgr. Iterate over the
3224 // range group ranges until rgr's end location is greater than or
3225 // equal to the current page.
3226 while (rgr && rgr->node.end < page_start) {
3227 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
3228 rgr_has_changed = true;
3229 }
3230
3231 // Check whether the current page lies within rgr. A single page
3232 // must entirely reside within a range group range. Since we've
3233 // incremented rgr until its end is higher than page_start, we now
3234 // check if page_start lies within rgr.
3235 if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
3236 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
3237 if (list_empty(&rgr->range_group_migrated_list_node))
3238 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
3239 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
3240
3241 rgr_has_changed = false;
3242 }
3243 }
3244
3245 // No need to copy pages that haven't changed. Just clear residency
3246 // information
3247 if (block_page_is_clean(block, dst_id, src_id, page_index))
3248 continue;
3249
3250 if (!copying_gpu) {
3251 status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
3252
3253 if (status != NV_OK)
3254 break;
3255 copying_gpu = uvm_push_get_gpu(&push);
3256
3257 // Record all processors involved in the copy
3258 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
3259 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
3260 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
3261
3262 // This function is called just once per VA block and needs to
3263 // receive the "main" cause for the migration (it mainly checks if
3264 // we are in the eviction path). Therefore, we pass cause instead
3265 // of contig_cause
3266 uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
3267 }
3268 else {
3269 uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3270 }
3271
3272 if (!uvm_va_block_is_hmm(block))
3273 block_update_page_dirty_state(block, dst_id, src_id, page_index);
3274
3275 if (last_index == region.outer) {
3276 bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
3277 bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
3278 contig_start_index = page_index;
3279 contig_cause = page_cause;
3280
3281 // When CC is enabled, transfers between GPU and CPU don't rely on
3282 // any GPU mapping of CPU chunks, physical or virtual.
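// Under CC the data is instead staged through the DMA buffer tracked in
// copy_state->dma_buffer (see the conf_computing_block_copy_push_* paths
// above), so a cached GPU address for the CPU side would go unused anyway.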
3283 if (UVM_ID_IS_CPU(src_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3284 can_cache_src_phys_addr = false;
3285 if (UVM_ID_IS_CPU(dst_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3286 can_cache_dst_phys_addr = false;
3287 // Computing the physical address is a non-trivial operation and
3288 // seems to be a performance limiter on systems with 2 or more
3289 // NVLINK links. Therefore, for physically-contiguous block
3290 // storage, we cache the start address and compute the page address
3291 // using the page index.
3292 if (can_cache_src_phys_addr) {
3293 copy_state.src.gpu_address = block_phys_page_copy_address(block,
3294 block_phys_page(src_id, 0),
3295 copying_gpu);
3296 }
3297 if (can_cache_dst_phys_addr) {
3298 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
3299 block_phys_page(dst_id, 0),
3300 copying_gpu);
3301 }
3302 }
3303 else if ((page_index != last_index + 1) || contig_cause != page_cause) {
3304 uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3305 UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3306
3307 // If both src and dst are physically-contiguous, consolidate copies
3308 // of contiguous pages into a single method.
3309 if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3310 block_copy_push(block, &copy_state, contig_region, &push);
3311
3312 uvm_perf_event_notify_migration(&va_space->perf_events,
3313 &push,
3314 block,
3315 dst_id,
3316 src_id,
3317 uvm_va_block_region_start(block, contig_region),
3318 uvm_va_block_region_size(contig_region),
3319 transfer_mode,
3320 contig_cause,
3321 &block_context->make_resident);
3322
3323 contig_start_index = page_index;
3324 contig_cause = page_cause;
3325 }
3326
3327 if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig)
3328 block_copy_push(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
3329
3330 last_index = page_index;
3331 }
3332
3333 // Copy the remaining pages
3334 if (copying_gpu) {
3335 uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3336 UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3337
3338 if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3339 block_copy_push(block, &copy_state, contig_region, &push);
3340
3341 uvm_perf_event_notify_migration(&va_space->perf_events,
3342 &push,
3343 block,
3344 dst_id,
3345 src_id,
3346 uvm_va_block_region_start(block, contig_region),
3347 uvm_va_block_region_size(contig_region),
3348 transfer_mode,
3349 contig_cause,
3350 &block_context->make_resident);
3351
3352 status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
3353 }
3354
3355 // Update VA block status bits
3356 //
3357 // Only update the bits for the pages that succeeded
3358 if (status != NV_OK)
3359 uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
3360
3361 *copied_pages = uvm_page_mask_weight(copy_mask);
3362 if (*copied_pages)
3363 uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
3364
3365 return status;
3366 }
3367
3368 // Copy resident pages to the destination from all source processors in the
3369 // src_processor_mask
3370 //
3371 // The function adds the pages that were successfully copied to the output
3372 // migrated_pages mask and returns the number of pages in copied_pages. These
3373 // fields are reliable even if an error is returned.
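// Sources are tried in order of increasing distance from dst_id (see the
// for_each_closest_id() loop below), so pages resident in closer memory are
// preferred over copies from farther processors.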
3374 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block, 3375 uvm_va_block_context_t *block_context, 3376 uvm_processor_id_t dst_id, 3377 const uvm_processor_mask_t *src_processor_mask, 3378 uvm_va_block_region_t region, 3379 const uvm_page_mask_t *page_mask, 3380 const uvm_page_mask_t *prefetch_page_mask, 3381 uvm_va_block_transfer_mode_t transfer_mode, 3382 NvU32 max_pages_to_copy, 3383 uvm_page_mask_t *migrated_pages, 3384 NvU32 *copied_pages_out, 3385 uvm_tracker_t *tracker_out) 3386 { 3387 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 3388 uvm_processor_id_t src_id; 3389 uvm_processor_mask_t search_mask; 3390 uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask; 3391 3392 uvm_processor_mask_copy(&search_mask, src_processor_mask); 3393 3394 *copied_pages_out = 0; 3395 3396 for_each_closest_id(src_id, &search_mask, dst_id, va_space) { 3397 uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id); 3398 NV_STATUS status; 3399 NvU32 copied_pages_from_src; 3400 3401 UVM_ASSERT(!uvm_id_equal(src_id, dst_id)); 3402 3403 uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask); 3404 3405 if (page_mask) 3406 uvm_page_mask_and(copy_mask, copy_mask, page_mask); 3407 3408 status = block_copy_resident_pages_between(block, 3409 block_context, 3410 dst_id, 3411 src_id, 3412 region, 3413 copy_mask, 3414 prefetch_page_mask, 3415 transfer_mode, 3416 migrated_pages, 3417 &copied_pages_from_src, 3418 tracker_out); 3419 *copied_pages_out += copied_pages_from_src; 3420 UVM_ASSERT(*copied_pages_out <= max_pages_to_copy); 3421 3422 if (status != NV_OK) 3423 return status; 3424 3425 // Break out once we copied max pages already 3426 if (*copied_pages_out == max_pages_to_copy) 3427 break; 3428 } 3429 3430 return NV_OK; 3431 } 3432 3433 static void break_read_duplication_in_region(uvm_va_block_t *block, 3434 uvm_va_block_context_t *block_context, 3435 uvm_processor_id_t dst_id, 3436 uvm_va_block_region_t region, 3437 const uvm_page_mask_t *page_mask) 3438 { 3439 uvm_processor_id_t id; 3440 uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask; 3441 3442 uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask); 3443 3444 UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id))); 3445 3446 // Clear read_duplicated bit for all pages in region 3447 uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region); 3448 3449 // Clear residency bits for all processors other than dst_id 3450 for_each_id_in_mask(id, &block->resident) { 3451 uvm_page_mask_t *other_resident_mask; 3452 3453 if (uvm_id_equal(id, dst_id)) 3454 continue; 3455 3456 other_resident_mask = uvm_va_block_resident_mask_get(block, id); 3457 3458 if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region)) 3459 block_clear_resident_processor(block, id); 3460 } 3461 } 3462 3463 static void block_copy_set_first_touch_residency(uvm_va_block_t *block, 3464 uvm_va_block_context_t *block_context, 3465 uvm_processor_id_t dst_id, 3466 uvm_va_block_region_t region, 3467 const uvm_page_mask_t *page_mask) 3468 { 3469 uvm_page_index_t page_index; 3470 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3471 uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask; 3472 3473 if (page_mask) 3474 uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask); 3475 
else 3476 uvm_page_mask_complement(first_touch_mask, resident_mask); 3477 3478 uvm_page_mask_region_clear_outside(first_touch_mask, region); 3479 3480 for_each_va_block_page_in_mask(page_index, first_touch_mask, block) { 3481 UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index)); 3482 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); 3483 UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); 3484 } 3485 3486 uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask); 3487 if (!uvm_page_mask_empty(resident_mask)) 3488 block_set_resident_processor(block, dst_id); 3489 3490 // Add them to the output mask, too 3491 uvm_page_mask_or(&block_context->make_resident.pages_changed_residency, 3492 &block_context->make_resident.pages_changed_residency, 3493 first_touch_mask); 3494 } 3495 3496 // Copy resident pages from other processors to the destination. 3497 // All the pages on the destination need to be populated by the caller first. 3498 // Pages not resident anywhere else need to be zeroed out as well. 3499 // The transfer_mode is only used to tell uvm_perf_event_notify_migration() 3500 // whether the copy is for a migration or read duplication. 3501 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block, 3502 uvm_va_block_context_t *block_context, 3503 uvm_processor_id_t dst_id, 3504 uvm_va_block_region_t region, 3505 const uvm_page_mask_t *page_mask, 3506 const uvm_page_mask_t *prefetch_page_mask, 3507 uvm_va_block_transfer_mode_t transfer_mode) 3508 { 3509 NV_STATUS status = NV_OK; 3510 NV_STATUS tracker_status; 3511 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 3512 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3513 NvU32 missing_pages_count; 3514 NvU32 pages_copied; 3515 NvU32 pages_copied_to_cpu; 3516 uvm_processor_mask_t src_processor_mask; 3517 uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask; 3518 uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated; 3519 uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged; 3520 3521 uvm_page_mask_zero(migrated_pages); 3522 uvm_page_mask_zero(staged_pages); 3523 3524 if (page_mask) 3525 uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask); 3526 else 3527 uvm_page_mask_complement(copy_page_mask, resident_mask); 3528 3529 missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region); 3530 3531 if (missing_pages_count == 0) 3532 goto out; 3533 3534 // TODO: Bug 1753731: Add P2P2P copies staged through a GPU 3535 // TODO: Bug 1753731: When a page is resident in multiple locations due to 3536 // read-duplication, spread out the source of the copy so we don't 3537 // bottleneck on a single location. 3538 3539 uvm_processor_mask_zero(&src_processor_mask); 3540 3541 if (!uvm_id_equal(dst_id, UVM_ID_CPU)) { 3542 // If the destination is a GPU, first copy everything from processors 3543 // with copy access supported. Notably this will copy pages from the CPU 3544 // as well even if later some extra copies from CPU are required for 3545 // staged copies. 
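// For example, if the destination is a GPU and some pages are only resident
// on a peer GPU that has no direct copy path to it, this first pass skips
// those pages; the code below then stages them by copying peer -> CPU and
// finally CPU -> destination.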
3546 uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident); 3547 uvm_processor_mask_clear(&src_processor_mask, dst_id); 3548 3549 status = block_copy_resident_pages_mask(block, 3550 block_context, 3551 dst_id, 3552 &src_processor_mask, 3553 region, 3554 copy_page_mask, 3555 prefetch_page_mask, 3556 transfer_mode, 3557 missing_pages_count, 3558 migrated_pages, 3559 &pages_copied, 3560 &local_tracker); 3561 3562 UVM_ASSERT(missing_pages_count >= pages_copied); 3563 missing_pages_count -= pages_copied; 3564 3565 if (status != NV_OK) 3566 goto out; 3567 3568 if (missing_pages_count == 0) 3569 goto out; 3570 3571 if (pages_copied) 3572 uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages); 3573 } 3574 3575 // Now copy from everywhere else to the CPU. This is both for when the 3576 // destination is the CPU (src_processor_mask empty) and for a staged copy 3577 // (src_processor_mask containing processors with copy access to dst_id). 3578 uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask); 3579 uvm_processor_mask_clear(&src_processor_mask, dst_id); 3580 uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU); 3581 3582 status = block_copy_resident_pages_mask(block, 3583 block_context, 3584 UVM_ID_CPU, 3585 &src_processor_mask, 3586 region, 3587 copy_page_mask, 3588 prefetch_page_mask, 3589 transfer_mode, 3590 missing_pages_count, 3591 staged_pages, 3592 &pages_copied_to_cpu, 3593 &local_tracker); 3594 if (status != NV_OK) 3595 goto out; 3596 3597 // If destination is the CPU then we copied everything there above 3598 if (UVM_ID_IS_CPU(dst_id)) { 3599 uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages); 3600 missing_pages_count -= pages_copied_to_cpu; 3601 3602 goto out; 3603 } 3604 3605 // Add everything to the block's tracker so that the 3606 // block_copy_resident_pages_between() call below will acquire it. 3607 status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); 3608 if (status != NV_OK) 3609 goto out; 3610 uvm_tracker_clear(&local_tracker); 3611 3612 // Now copy staged pages from the CPU to the destination. 3613 status = block_copy_resident_pages_between(block, 3614 block_context, 3615 dst_id, 3616 UVM_ID_CPU, 3617 region, 3618 staged_pages, 3619 prefetch_page_mask, 3620 transfer_mode, 3621 migrated_pages, 3622 &pages_copied, 3623 &local_tracker); 3624 3625 UVM_ASSERT(missing_pages_count >= pages_copied); 3626 missing_pages_count -= pages_copied; 3627 3628 if (status != NV_OK) 3629 goto out; 3630 3631 // If we get here, that means we were staging the copy through the CPU and 3632 // we should copy as many pages from the CPU as we copied to the CPU. 3633 UVM_ASSERT(pages_copied == pages_copied_to_cpu); 3634 3635 out: 3636 // Add everything from the local tracker to the block's tracker. 3637 // Notably this is also needed for handling 3638 // block_copy_resident_pages_between() failures in the first loop. 3639 tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); 3640 uvm_tracker_deinit(&local_tracker); 3641 3642 return status == NV_OK ? 
tracker_status : status; 3643 } 3644 3645 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block, 3646 uvm_va_block_retry_t *va_block_retry, 3647 uvm_va_block_context_t *va_block_context, 3648 uvm_processor_id_t dest_id, 3649 uvm_va_block_region_t region, 3650 const uvm_page_mask_t *page_mask, 3651 const uvm_page_mask_t *prefetch_page_mask, 3652 uvm_make_resident_cause_t cause) 3653 { 3654 NV_STATUS status; 3655 uvm_processor_mask_t unmap_processor_mask; 3656 uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask; 3657 uvm_page_mask_t *resident_mask; 3658 3659 va_block_context->make_resident.dest_id = dest_id; 3660 va_block_context->make_resident.cause = cause; 3661 3662 if (prefetch_page_mask) { 3663 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3664 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3665 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); 3666 } 3667 3668 uvm_assert_mutex_locked(&va_block->lock); 3669 UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 3670 3671 resident_mask = block_resident_mask_get_alloc(va_block, dest_id); 3672 if (!resident_mask) 3673 return NV_ERR_NO_MEMORY; 3674 3675 // Unmap all mapped processors except for UVM-Lite GPUs as their mappings 3676 // are largely persistent. 3677 uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); 3678 3679 if (page_mask) 3680 uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask); 3681 else 3682 uvm_page_mask_complement(unmap_page_mask, resident_mask); 3683 uvm_page_mask_region_clear_outside(unmap_page_mask, region); 3684 3685 // Unmap all pages not resident on the destination 3686 status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); 3687 if (status != NV_OK) 3688 return status; 3689 3690 if (page_mask) 3691 uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages); 3692 else 3693 uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages); 3694 uvm_page_mask_region_clear_outside(unmap_page_mask, region); 3695 3696 // Also unmap read-duplicated pages excluding dest_id 3697 uvm_processor_mask_clear(&unmap_processor_mask, dest_id); 3698 status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); 3699 if (status != NV_OK) 3700 return status; 3701 3702 uvm_tools_record_read_duplicate_invalidate(va_block, 3703 dest_id, 3704 region, 3705 unmap_page_mask); 3706 3707 // Note that block_populate_pages and block_copy_resident_pages also use 3708 // va_block_context->make_resident.page_mask. 
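// Clear the local alias so the stale unmap mask cannot be reused by mistake
// once those calls start writing to the same storage.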
3709 unmap_page_mask = NULL; 3710 3711 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); 3712 if (status != NV_OK) 3713 return status; 3714 3715 return block_copy_resident_pages(va_block, 3716 va_block_context, 3717 dest_id, 3718 region, 3719 page_mask, 3720 prefetch_page_mask, 3721 UVM_VA_BLOCK_TRANSFER_MODE_MOVE); 3722 } 3723 3724 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block, 3725 uvm_processor_id_t dst_id, 3726 uvm_page_mask_t *page_mask) 3727 { 3728 uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id); 3729 3730 UVM_ASSERT(dst_gpu_state); 3731 3732 if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask)) 3733 uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id); 3734 } 3735 3736 static void block_make_resident_update_state(uvm_va_block_t *va_block, 3737 uvm_va_block_context_t *va_block_context, 3738 uvm_processor_id_t dst_id, 3739 uvm_va_block_region_t region, 3740 uvm_page_mask_t *copy_mask, 3741 uvm_make_resident_cause_t cause) 3742 { 3743 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id); 3744 3745 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask); 3746 block_set_resident_processor(va_block, dst_id); 3747 3748 // Accumulate the pages that migrated into the output mask. 3749 uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency, 3750 &va_block_context->make_resident.pages_changed_residency, 3751 copy_mask); 3752 3753 // Any move operation implies that mappings have been removed from all 3754 // non-UVM-Lite GPUs. 3755 uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask); 3756 3757 // If we are migrating due to an eviction, set the GPU as evicted and 3758 // mark the evicted pages. If we are migrating away from the CPU this 3759 // means that those pages are not evicted. 3760 if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) { 3761 uvm_processor_id_t src_id; 3762 3763 UVM_ASSERT(UVM_ID_IS_CPU(dst_id)); 3764 3765 // Note that the destination is the CPU so this loop excludes it. 3766 for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) { 3767 uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id); 3768 3769 UVM_ASSERT(src_gpu_state); 3770 3771 uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask); 3772 uvm_processor_mask_set(&va_block->evicted_gpus, src_id); 3773 } 3774 } 3775 else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id)) 3776 block_make_resident_clear_evicted(va_block, dst_id, copy_mask); 3777 } 3778 3779 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block, 3780 uvm_va_block_context_t *va_block_context, 3781 uvm_va_block_region_t region, 3782 const uvm_page_mask_t *page_mask) 3783 { 3784 uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated; 3785 uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id; 3786 3787 uvm_assert_mutex_locked(&va_block->lock); 3788 3789 if (page_mask) 3790 uvm_page_mask_and(migrated_pages, migrated_pages, page_mask); 3791 3792 if (!uvm_page_mask_empty(migrated_pages)) { 3793 // The migrated pages are now resident on the destination. 
3794 block_make_resident_update_state(va_block, 3795 va_block_context, 3796 dst_id, 3797 region, 3798 migrated_pages, 3799 va_block_context->make_resident.cause); 3800 } 3801 3802 // Pages that weren't resident anywhere else were populated at the 3803 // destination directly. Mark them as resident now. 3804 block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask); 3805 3806 // Break read duplication and clear residency from other processors. 3807 break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask); 3808 3809 // Update eviction heuristics, if needed. Notably this could repeat the call 3810 // done in block_set_resident_processor(), but that doesn't do anything bad 3811 // and it's simpler to keep it in both places. 3812 // 3813 // Skip this if we didn't do anything (the input region and/or page mask was 3814 // empty). 3815 if (uvm_processor_mask_test(&va_block->resident, dst_id)) 3816 block_mark_memory_used(va_block, dst_id); 3817 } 3818 3819 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block, 3820 uvm_va_block_retry_t *va_block_retry, 3821 uvm_va_block_context_t *va_block_context, 3822 uvm_processor_id_t dest_id, 3823 uvm_va_block_region_t region, 3824 const uvm_page_mask_t *page_mask, 3825 const uvm_page_mask_t *prefetch_page_mask, 3826 uvm_make_resident_cause_t cause) 3827 { 3828 NV_STATUS status; 3829 3830 status = uvm_va_block_make_resident_copy(va_block, 3831 va_block_retry, 3832 va_block_context, 3833 dest_id, 3834 region, 3835 page_mask, 3836 prefetch_page_mask, 3837 cause); 3838 if (status != NV_OK) 3839 return status; 3840 3841 uvm_va_block_make_resident_finish(va_block, 3842 va_block_context, 3843 region, 3844 page_mask); 3845 3846 return NV_OK; 3847 } 3848 3849 // Combination function which prepares the input {region, page_mask} for 3850 // entering read-duplication. It: 3851 // - Unmaps all processors but revoke_id 3852 // - Revokes write access from revoke_id 3853 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block, 3854 uvm_va_block_context_t *va_block_context, 3855 uvm_processor_id_t revoke_id, 3856 uvm_va_block_region_t region, 3857 const uvm_page_mask_t *page_mask) 3858 { 3859 uvm_processor_mask_t unmap_processor_mask; 3860 uvm_processor_id_t unmap_id; 3861 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 3862 NV_STATUS status, tracker_status; 3863 3864 // Unmap everybody except revoke_id 3865 uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); 3866 uvm_processor_mask_clear(&unmap_processor_mask, revoke_id); 3867 3868 for_each_id_in_mask(unmap_id, &unmap_processor_mask) { 3869 status = uvm_va_block_unmap(va_block, 3870 va_block_context, 3871 unmap_id, 3872 region, 3873 page_mask, 3874 &local_tracker); 3875 if (status != NV_OK) 3876 goto out; 3877 } 3878 3879 // Revoke WRITE/ATOMIC access permissions from the remaining mapped 3880 // processor. 3881 status = uvm_va_block_revoke_prot(va_block, 3882 va_block_context, 3883 revoke_id, 3884 region, 3885 page_mask, 3886 UVM_PROT_READ_WRITE, 3887 &local_tracker); 3888 if (status != NV_OK) 3889 goto out; 3890 3891 out: 3892 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 3893 uvm_tracker_deinit(&local_tracker); 3894 return status == NV_OK ? 
tracker_status : status; 3895 } 3896 3897 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block, 3898 uvm_va_block_retry_t *va_block_retry, 3899 uvm_va_block_context_t *va_block_context, 3900 uvm_processor_id_t dest_id, 3901 uvm_va_block_region_t region, 3902 const uvm_page_mask_t *page_mask, 3903 const uvm_page_mask_t *prefetch_page_mask, 3904 uvm_make_resident_cause_t cause) 3905 { 3906 NV_STATUS status = NV_OK; 3907 uvm_processor_id_t src_id; 3908 uvm_page_mask_t *dst_resident_mask; 3909 uvm_page_mask_t *cpu_resident_mask; 3910 uvm_page_mask_t *migrated_pages; 3911 uvm_page_mask_t *staged_pages; 3912 uvm_page_mask_t *first_touch_mask; 3913 3914 // TODO: Bug 3660922: need to implement HMM read duplication support. 3915 UVM_ASSERT(!uvm_va_block_is_hmm(va_block)); 3916 3917 va_block_context->make_resident.dest_id = dest_id; 3918 va_block_context->make_resident.cause = cause; 3919 3920 if (prefetch_page_mask) { 3921 // TODO: Bug 1877578: investigate automatic read-duplicate policies 3922 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3923 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3924 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); 3925 } 3926 3927 uvm_assert_mutex_locked(&va_block->lock); 3928 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 3929 3930 // For pages that are entering read-duplication we need to unmap remote 3931 // mappings and revoke RW and higher access permissions. 3932 // 3933 // The current implementation: 3934 // - Unmaps pages from all processors but the one with the resident copy 3935 // - Revokes write access from the processor with the resident copy 3936 for_each_id_in_mask(src_id, &va_block->resident) { 3937 // Note that the below calls to block_populate_pages and 3938 // block_copy_resident_pages also use 3939 // va_block_context->make_resident.page_mask. 3940 uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask; 3941 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id); 3942 UVM_ASSERT(!uvm_page_mask_empty(resident_mask)); 3943 3944 if (page_mask) 3945 uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages); 3946 else 3947 uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages); 3948 3949 // If there are no pages that need to be unmapped/revoked, skip to the 3950 // next processor 3951 if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask)) 3952 continue; 3953 3954 status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask); 3955 if (status != NV_OK) 3956 return status; 3957 } 3958 3959 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); 3960 if (status != NV_OK) 3961 return status; 3962 3963 status = block_copy_resident_pages(va_block, 3964 va_block_context, 3965 dest_id, 3966 region, 3967 page_mask, 3968 prefetch_page_mask, 3969 UVM_VA_BLOCK_TRANSFER_MODE_COPY); 3970 if (status != NV_OK) 3971 return status; 3972 3973 // Pages that weren't resident anywhere else were populated at the 3974 // destination directly. Mark them as resident now, since there were no 3975 // errors from block_copy_resident_pages() above. 
3976 // Note that va_block_context->scratch_page_mask is passed to 3977 // block_copy_set_first_touch_residency() which is generally unsafe but in 3978 // this case, block_copy_set_first_touch_residency() copies page_mask 3979 // before scratch_page_mask could be clobbered. 3980 migrated_pages = &va_block_context->make_resident.pages_migrated; 3981 first_touch_mask = &va_block_context->scratch_page_mask; 3982 uvm_page_mask_init_from_region(first_touch_mask, region, page_mask); 3983 uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages); 3984 3985 if (!uvm_page_mask_empty(first_touch_mask)) 3986 block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask); 3987 3988 staged_pages = &va_block_context->make_resident.pages_staged; 3989 if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) { 3990 cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU); 3991 uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages); 3992 block_set_resident_processor(va_block, UVM_ID_CPU); 3993 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages); 3994 uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages); 3995 } 3996 3997 if (!uvm_page_mask_empty(migrated_pages)) { 3998 dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id); 3999 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages); 4000 block_set_resident_processor(va_block, dest_id); 4001 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages); 4002 uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages); 4003 } 4004 4005 UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION); 4006 if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id)) 4007 block_make_resident_clear_evicted(va_block, dest_id, migrated_pages); 4008 4009 // Update eviction heuristics, if needed. Notably this could repeat the call 4010 // done in block_set_resident_processor(), but that doesn't do anything bad 4011 // and it's simpler to keep it in both places. 4012 // 4013 // Skip this if we didn't do anything (the input region and/or page mask was 4014 // empty). 4015 if (uvm_processor_mask_test(&va_block->resident, dest_id)) 4016 block_mark_memory_used(va_block, dest_id); 4017 4018 return NV_OK; 4019 } 4020 4021 // Looks up the current CPU mapping state of page from the 4022 // block->cpu.pte_bits bitmaps. If write access is enabled, 4023 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since 4024 // write access implies atomic access for CPUs. 4025 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index) 4026 { 4027 uvm_prot_t prot; 4028 4029 UVM_ASSERT(!uvm_va_block_is_dead(block)); 4030 4031 if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index)) 4032 prot = UVM_PROT_READ_WRITE_ATOMIC; 4033 else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) 4034 prot = UVM_PROT_READ_ONLY; 4035 else 4036 prot = UVM_PROT_NONE; 4037 4038 return prot; 4039 } 4040 4041 // Looks up the current GPU mapping state of page from the 4042 // block->gpus[i]->pte_bits bitmaps. 
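// Unlike the CPU lookup above, GPUs track an explicit ATOMIC PTE bit, so
// UVM_PROT_READ_WRITE and UVM_PROT_READ_WRITE_ATOMIC are distinguished here
// instead of write access implying atomic access.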
4043 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index) 4044 { 4045 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4046 uvm_prot_t prot; 4047 4048 UVM_ASSERT(!uvm_va_block_is_dead(block)); 4049 4050 if (!gpu_state) 4051 return UVM_PROT_NONE; 4052 4053 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index)) 4054 prot = UVM_PROT_READ_WRITE_ATOMIC; 4055 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index)) 4056 prot = UVM_PROT_READ_WRITE; 4057 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) 4058 prot = UVM_PROT_READ_ONLY; 4059 else 4060 prot = UVM_PROT_NONE; 4061 4062 return prot; 4063 } 4064 4065 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index) 4066 { 4067 if (UVM_ID_IS_CPU(id)) 4068 return block_page_prot_cpu(block, page_index); 4069 else 4070 return block_page_prot_gpu(block, block_get_gpu(block, id), page_index); 4071 } 4072 4073 // Returns true if the block has any valid CPU PTE mapping in the block region. 4074 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region) 4075 { 4076 size_t valid_page; 4077 4078 UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block)); 4079 4080 // Early-out: check whether any address in this block has a CPU mapping 4081 if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) { 4082 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])); 4083 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 4084 return false; 4085 } 4086 4087 // All valid mappings have at least read permissions so we only need to 4088 // inspect the read bits. 
4089 valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]); 4090 if (valid_page == region.outer) 4091 return false; 4092 4093 UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE); 4094 return true; 4095 } 4096 4097 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 4098 { 4099 uvm_gpu_t *accessing_gpu; 4100 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4101 4102 if (!uvm_pmm_sysmem_mappings_indirect_supported()) 4103 return true; 4104 4105 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 4106 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 4107 uvm_reverse_map_t reverse_map; 4108 size_t num_mappings; 4109 4110 num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings, 4111 peer_addr, 4112 uvm_gpu_chunk_get_size(chunk), 4113 &reverse_map, 4114 1); 4115 UVM_ASSERT(num_mappings == 1); 4116 UVM_ASSERT(reverse_map.va_block == block); 4117 UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index); 4118 UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk)); 4119 4120 uvm_va_block_release_no_destroy(reverse_map.va_block); 4121 } 4122 4123 return true; 4124 } 4125 4126 // Sanity check the given GPU's chunks array 4127 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id) 4128 { 4129 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 4130 uvm_gpu_t *gpu; 4131 size_t i, num_chunks; 4132 uvm_page_index_t page_index; 4133 uvm_chunk_size_t chunk_size; 4134 4135 if (!gpu_state) 4136 return true; 4137 4138 gpu = block_get_gpu(block, id); 4139 4140 num_chunks = block_num_gpu_chunks(block, gpu); 4141 for (page_index = 0, i = 0; i < num_chunks; i++) { 4142 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 4143 size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); 4144 4145 if (chunk_index != i) { 4146 UVM_ERR_PRINT("chunk index mismatch: calculated %zu, is in %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n", 4147 chunk_index, 4148 i, 4149 block->start, 4150 block->end + 1, 4151 uvm_id_value(id), 4152 page_index); 4153 return false; 4154 } 4155 4156 if (chunk) { 4157 if (chunk_size != uvm_gpu_chunk_get_size(chunk)) { 4158 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n", 4159 chunk_size, 4160 uvm_gpu_chunk_get_size(chunk), 4161 block->start, 4162 block->end + 1, 4163 uvm_id_value(id), 4164 page_index, 4165 i); 4166 return false; 4167 } 4168 4169 if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { 4170 UVM_ERR_PRINT("Invalid chunk state %s. 
VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n", 4171 uvm_pmm_gpu_chunk_state_string(chunk->state), 4172 block->start, 4173 block->end + 1, 4174 uvm_id_value(id), 4175 page_index, 4176 i, 4177 chunk_size); 4178 return false; 4179 } 4180 4181 UVM_ASSERT(chunk->va_block == block); 4182 UVM_ASSERT(chunk->va_block_page_index == page_index); 4183 4184 UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk)); 4185 } 4186 4187 page_index += chunk_size / PAGE_SIZE; 4188 } 4189 4190 return true; 4191 } 4192 4193 static bool block_check_chunks(uvm_va_block_t *va_block) 4194 { 4195 uvm_gpu_id_t id; 4196 4197 for_each_gpu_id(id) { 4198 if (!block_check_gpu_chunks(va_block, id)) 4199 return false; 4200 } 4201 4202 return block_check_cpu_chunks(va_block); 4203 } 4204 4205 // Sanity checks for page mappings 4206 static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index) 4207 { 4208 uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings; 4209 uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings; 4210 uvm_processor_mask_t remaining_mappings, temp_mappings; 4211 uvm_processor_mask_t resident_processors; 4212 const uvm_processor_mask_t *residency_accessible_from = NULL; 4213 const uvm_processor_mask_t *residency_has_native_atomics = NULL; 4214 uvm_processor_id_t residency, id; 4215 uvm_va_range_t *va_range = block->va_range; 4216 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4217 uvm_processor_id_t preferred_location = va_range ? 4218 uvm_va_range_get_policy(va_range)->preferred_location : 4219 UVM_ID_INVALID; 4220 const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block); 4221 4222 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); 4223 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings); 4224 block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings); 4225 4226 // Each access bit implies all accesses below it 4227 UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings)); 4228 UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings)); 4229 UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped)); 4230 4231 uvm_va_block_page_resident_processors(block, page_index, &resident_processors); 4232 UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident)); 4233 4234 // Sanity check block_get_mapped_processors 4235 uvm_processor_mask_copy(&remaining_mappings, &read_mappings); 4236 for_each_id_in_mask(residency, &resident_processors) { 4237 block_get_mapped_processors(block, residency, page_index, &temp_mappings); 4238 UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings)); 4239 uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings); 4240 } 4241 4242 // Any remaining mappings point to non-resident locations, so they must be 4243 // UVM-Lite mappings. 
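// UVM-Lite GPUs keep mapping the preferred location even while the pages are
// resident elsewhere, which is why they can legitimately appear here (see the
// preferred_location checks below).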
4244 UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus)); 4245 4246 residency = uvm_processor_mask_find_first_id(&resident_processors); 4247 4248 if (uvm_processor_mask_get_count(&resident_processors) > 0) { 4249 residency_accessible_from = &va_space->accessible_from[uvm_id_value(residency)]; 4250 residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)]; 4251 } 4252 4253 // If the page is not resident, there should be no valid mappings 4254 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 || 4255 uvm_processor_mask_get_count(&read_mappings) == 0, 4256 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4257 *resident_processors.bitmap, 4258 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4259 *va_space->system_wide_atomics_enabled_processors.bitmap, 4260 *block->read_duplicated_pages.bitmap); 4261 4262 // Test read_duplicated_pages mask 4263 UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 && 4264 !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) || 4265 (uvm_processor_mask_get_count(&resident_processors) > 1 && 4266 uvm_page_mask_test(&block->read_duplicated_pages, page_index)), 4267 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4268 *resident_processors.bitmap, 4269 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4270 *va_space->system_wide_atomics_enabled_processors.bitmap, 4271 *block->read_duplicated_pages.bitmap); 4272 4273 if (!uvm_processor_mask_empty(uvm_lite_gpus)) 4274 UVM_ASSERT(UVM_ID_IS_VALID(preferred_location)); 4275 4276 // UVM-Lite checks. Since the range group is made non-migratable before the 4277 // actual migrations for that range group happen, we can only make those 4278 // checks which are valid on both migratable and non-migratable range 4279 // groups. 4280 uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus); 4281 uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus); 4282 4283 // Any mapping from a UVM-Lite GPU must be atomic... 4284 UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings)); 4285 4286 // ... 
and must have access to preferred_location 4287 if (UVM_ID_IS_VALID(preferred_location)) { 4288 const uvm_processor_mask_t *preferred_location_accessible_from; 4289 4290 preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)]; 4291 UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from)); 4292 } 4293 4294 for_each_id_in_mask(id, &lite_atomic_mappings) 4295 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location)); 4296 4297 // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests 4298 uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus); 4299 uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus); 4300 uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus); 4301 4302 // Pages set to zero in maybe_mapped_pages must not be mapped on any 4303 // non-UVM-Lite GPU 4304 if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) { 4305 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0, 4306 "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n", 4307 *resident_processors.bitmap, 4308 *block->mapped.bitmap, 4309 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap); 4310 } 4311 4312 // atomic mappings from GPUs with disabled system-wide atomics are treated 4313 // as write mappings. Therefore, we remove them from the atomic mappings mask 4314 uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors); 4315 4316 if (!uvm_processor_mask_empty(&read_mappings)) { 4317 // Read-duplicate: if a page is resident in multiple locations, it 4318 // must be resident locally on each mapped processor. 
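// For example, if a page is read-duplicated on the CPU and one GPU, that GPU
// must be mapping its own local copy rather than the CPU's copy.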
4319 if (uvm_processor_mask_get_count(&resident_processors) > 1) { 4320 UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors), 4321 "Read-duplicate copies from remote processors\n" 4322 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4323 *resident_processors.bitmap, 4324 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4325 *va_space->system_wide_atomics_enabled_processors.bitmap, 4326 *block->read_duplicated_pages.bitmap); 4327 } 4328 else { 4329 // Processors with mappings must have access to the processor that 4330 // has the valid copy 4331 UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from), 4332 "Not all processors have access to %s\n" 4333 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4334 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4335 uvm_va_space_processor_name(va_space, residency), 4336 *resident_processors.bitmap, 4337 *read_mappings.bitmap, 4338 *write_mappings.bitmap, 4339 *atomic_mappings.bitmap, 4340 *residency_accessible_from->bitmap, 4341 *residency_has_native_atomics->bitmap, 4342 *va_space->system_wide_atomics_enabled_processors.bitmap); 4343 for_each_id_in_mask(id, &read_mappings) { 4344 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency)); 4345 4346 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) { 4347 uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency); 4348 uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id); 4349 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL); 4350 4351 // This function will assert if no mapping exists 4352 (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu); 4353 } 4354 } 4355 } 4356 } 4357 4358 // If any processor has a writable mapping, there must only be one copy of 4359 // the page in the system 4360 if (!uvm_processor_mask_empty(&write_mappings)) { 4361 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1, 4362 "Too many resident copies for pages with write_mappings\n" 4363 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4364 *resident_processors.bitmap, 4365 *read_mappings.bitmap, 4366 *write_mappings.bitmap, 4367 *atomic_mappings.bitmap, 4368 *va_space->system_wide_atomics_enabled_processors.bitmap, 4369 *block->read_duplicated_pages.bitmap); 4370 } 4371 4372 if (!uvm_processor_mask_empty(&atomic_mappings)) { 4373 uvm_processor_mask_t native_atomics; 4374 4375 uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics); 4376 4377 if (uvm_processor_mask_empty(&native_atomics)) { 4378 // No other faultable processor should be able to write 4379 uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors); 4380 4381 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1, 4382 "Too many write mappings to %s from processors with non-native atomics\n" 4383 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4384 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4385 uvm_va_space_processor_name(va_space, residency), 4386 *resident_processors.bitmap, 4387 *read_mappings.bitmap, 4388 *write_mappings.bitmap, 4389 *atomic_mappings.bitmap, 4390 *residency_accessible_from->bitmap, 4391 *residency_has_native_atomics->bitmap, 4392 *va_space->system_wide_atomics_enabled_processors.bitmap); 4393 4394 // Only one 
processor outside of the native group can have atomics enabled 4395 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1, 4396 "Too many atomics mappings to %s from processors with non-native atomics\n" 4397 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4398 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4399 uvm_va_space_processor_name(va_space, residency), 4400 *resident_processors.bitmap, 4401 *read_mappings.bitmap, 4402 *write_mappings.bitmap, 4403 *atomic_mappings.bitmap, 4404 *residency_accessible_from->bitmap, 4405 *residency_has_native_atomics->bitmap, 4406 *va_space->system_wide_atomics_enabled_processors.bitmap); 4407 } 4408 else { 4409 uvm_processor_mask_t non_native_atomics; 4410 4411 // One or more processors within the native group have atomics enabled. 4412 // All processors outside of that group may have write but not atomic 4413 // permissions. 4414 uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics); 4415 4416 UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics), 4417 "atomic mappings to %s from processors native and non-native\n" 4418 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4419 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4420 uvm_va_space_processor_name(va_space, residency), 4421 *resident_processors.bitmap, 4422 *read_mappings.bitmap, 4423 *write_mappings.bitmap, 4424 *atomic_mappings.bitmap, 4425 *residency_accessible_from->bitmap, 4426 *residency_has_native_atomics->bitmap, 4427 *va_space->system_wide_atomics_enabled_processors.bitmap); 4428 } 4429 } 4430 4431 return true; 4432 } 4433 4434 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu) 4435 { 4436 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4437 uvm_va_block_gpu_state_t *resident_gpu_state; 4438 uvm_pte_bits_gpu_t pte_bit; 4439 uvm_processor_id_t resident_id; 4440 uvm_prot_t prot; 4441 NvU32 big_page_size; 4442 size_t num_big_pages, big_page_index; 4443 uvm_va_block_region_t big_region, chunk_region; 4444 uvm_gpu_chunk_t *chunk; 4445 4446 if (!gpu_state->page_table_range_4k.table) 4447 UVM_ASSERT(!gpu_state->activated_4k); 4448 4449 if (!gpu_state->page_table_range_big.table) { 4450 UVM_ASSERT(!gpu_state->initialized_big); 4451 UVM_ASSERT(!gpu_state->activated_big); 4452 } 4453 4454 // It's only safe to check the PTE mappings if we have page tables. See 4455 // uvm_va_block_get_gpu_va_space. 4456 if (!block_gpu_has_page_tables(block, gpu)) { 4457 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id)); 4458 return true; 4459 } 4460 4461 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 4462 num_big_pages = uvm_va_block_num_big_pages(block, big_page_size); 4463 4464 if (block_gpu_supports_2m(block, gpu)) { 4465 if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) { 4466 // 2M blocks require the 2M entry to be allocated for the lower 4467 // ranges to also be allocated. 4468 UVM_ASSERT(gpu_state->page_table_range_2m.table); 4469 } 4470 else if (gpu_state->page_table_range_2m.table) { 4471 // If the 2M entry is present but the lower ones aren't, the PTE 4472 // must be 2M. 
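// In other words, the big and 4k page table ranges are only allocated
// underneath an existing 2M entry, so a standalone 2M entry implies the
// block is currently mapped by a single 2M PTE.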
4473 UVM_ASSERT(gpu_state->pte_is_2m); 4474 } 4475 } 4476 else { 4477 UVM_ASSERT(!gpu_state->page_table_range_2m.table); 4478 if (num_big_pages == 0) 4479 UVM_ASSERT(!gpu_state->page_table_range_big.table); 4480 } 4481 4482 // If we have the big table and it's in use then it must have been 4483 // initialized, even if it doesn't currently contain active PTEs. 4484 if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) || 4485 (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big)) 4486 UVM_ASSERT(gpu_state->initialized_big); 4487 4488 if (gpu_state->pte_is_2m) { 4489 UVM_ASSERT(block_gpu_supports_2m(block, gpu)); 4490 UVM_ASSERT(gpu_state->page_table_range_2m.table); 4491 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 4492 UVM_ASSERT(!gpu_state->force_4k_ptes); 4493 4494 // GPU architectures which support 2M pages only support 64K as the big 4495 // page size. All of the 2M code assumes that 4496 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full, 4497 // bitmap_complement, etc). 4498 BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4499 4500 prot = block_page_prot_gpu(block, gpu, 0); 4501 4502 // All page permissions match 4503 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 4504 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) 4505 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit])); 4506 else 4507 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit])); 4508 } 4509 4510 if (prot != UVM_PROT_NONE) { 4511 resident_id = block_gpu_get_processor_to_map(block, gpu, 0); 4512 4513 // block_check_resident_proximity verifies that no closer processor 4514 // has a resident page, so we don't need to check that all pages 4515 // have the same resident_id. 4516 4517 // block_check_mappings_page verifies that all pages marked resident 4518 // are backed by populated memory. 4519 4520 // The mapped processor should be fully resident and physically- 4521 // contiguous. 4522 UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id))); 4523 4524 if (UVM_ID_IS_GPU(resident_id)) { 4525 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id); 4526 UVM_ASSERT(resident_gpu_state); 4527 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M); 4528 } 4529 else { 4530 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block, 4531 uvm_va_block_region_from_block(block), 4532 NULL); 4533 4534 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated)); 4535 UVM_ASSERT(chunk); 4536 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 4537 } 4538 } 4539 } 4540 else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 4541 UVM_ASSERT(gpu_state->page_table_range_big.table); 4542 UVM_ASSERT(!gpu_state->force_4k_ptes); 4543 UVM_ASSERT(num_big_pages > 0); 4544 UVM_ASSERT(gpu_state->initialized_big); 4545 4546 for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) { 4547 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 4548 4549 if (!test_bit(big_page_index, gpu_state->big_ptes)) { 4550 // If there are valid mappings but this isn't a big PTE, the 4551 // mapping must be using the 4k PTEs. 
4552 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region)) 4553 UVM_ASSERT(gpu_state->page_table_range_4k.table); 4554 continue; 4555 } 4556 4557 prot = block_page_prot_gpu(block, gpu, big_region.first); 4558 4559 // All page permissions match 4560 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 4561 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) 4562 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region)); 4563 else 4564 UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region)); 4565 } 4566 4567 if (prot != UVM_PROT_NONE) { 4568 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 4569 4570 // The mapped processor should be fully resident and physically- 4571 // contiguous. Exception: UVM-Lite GPUs always map the preferred 4572 // location even if the memory is resident elsewhere. Skip the 4573 // residency check but still verify contiguity. 4574 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { 4575 UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id), 4576 big_region)); 4577 } 4578 4579 if (UVM_ID_IS_CPU(resident_id)) { 4580 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first); 4581 4582 UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages); 4583 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region)); 4584 } 4585 else { 4586 // Check GPU chunks 4587 chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL); 4588 chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first); 4589 UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region)); 4590 } 4591 } 4592 } 4593 } 4594 4595 return true; 4596 } 4597 4598 static bool block_check_mappings(uvm_va_block_t *block) 4599 { 4600 uvm_page_index_t page_index; 4601 uvm_processor_id_t id; 4602 4603 // Verify the master masks, since block_check_mappings_page relies on them 4604 for_each_processor_id(id) { 4605 const uvm_page_mask_t *resident_mask, *map_mask; 4606 4607 if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) { 4608 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 4609 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); 4610 UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id)); 4611 continue; 4612 } 4613 4614 resident_mask = uvm_va_block_resident_mask_get(block, id); 4615 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask)); 4616 4617 map_mask = uvm_va_block_map_mask_get(block, id); 4618 UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask)); 4619 4620 if (UVM_ID_IS_GPU(id)) { 4621 const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id); 4622 UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask)); 4623 4624 // Pages cannot be resident if they are marked as evicted 4625 UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask)); 4626 4627 // Pages cannot be resident on a GPU with no memory 4628 if (!block_processor_has_memory(block, id)) 4629 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 4630 } 4631 } 4632 4633 // Check that every page has coherent mappings 4634 for_each_va_block_page(page_index, block) 4635 block_check_mappings_page(block, page_index); 4636 4637 for_each_gpu_id(id) { 4638 if 
(uvm_va_block_gpu_state_get(block, id)) { 4639 uvm_gpu_t *gpu = block_get_gpu(block, id); 4640 4641 // Check big and/or 2M PTE state 4642 block_check_mappings_ptes(block, gpu); 4643 } 4644 } 4645 4646 return true; 4647 } 4648 4649 // See the comments on uvm_va_block_unmap 4650 static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages) 4651 { 4652 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4653 uvm_pte_bits_cpu_t pte_bit; 4654 bool unmapped_something = false; 4655 uvm_va_block_region_t subregion; 4656 NvU32 num_mapped_processors; 4657 4658 // Early-out if nothing in the region is mapped or being unmapped. 4659 if (!block_has_valid_mapping_cpu(block, region) || 4660 (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))) 4661 return; 4662 4663 // We can't actually unmap HMM ranges from the CPU here. 4664 // Unmapping happens as part of migrate_vma_setup(). 4665 if (uvm_va_block_is_hmm(block)) { 4666 UVM_ASSERT(!uvm_va_block_is_hmm(block)); 4667 return; 4668 } 4669 4670 num_mapped_processors = uvm_processor_mask_get_count(&block->mapped); 4671 4672 // If we are unmapping a page which we are tracking due to CPU faults with 4673 // correct permissions, clear the info. This will cover both the unmap and 4674 // revoke cases (since we implement CPU revocation by unmap + map) 4675 if (block->cpu.fault_authorized.first_fault_stamp && 4676 uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index)) 4677 block->cpu.fault_authorized.first_fault_stamp = 0; 4678 4679 for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) { 4680 if (!block_has_valid_mapping_cpu(block, subregion)) 4681 continue; 4682 4683 unmap_mapping_range(va_space->mapping, 4684 uvm_va_block_region_start(block, subregion), 4685 uvm_va_block_region_size(subregion), 1); 4686 4687 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) 4688 uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion); 4689 4690 // If the CPU is the only processor with mappings we can safely mark 4691 // the pages as fully unmapped 4692 if (num_mapped_processors == 1) 4693 uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion); 4694 4695 unmapped_something = true; 4696 } 4697 4698 if (!unmapped_something) 4699 return; 4700 4701 // Check whether the block has any more mappings 4702 if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) { 4703 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 4704 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 4705 } 4706 4707 UVM_ASSERT(block_check_mappings(block)); 4708 } 4709 4710 // Given a mask of mapped pages, returns true if any of the pages in the mask 4711 // are mapped remotely by the given GPU. 4712 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block, 4713 uvm_page_mask_t *scratch_page_mask, 4714 uvm_gpu_id_t gpu_id, 4715 const uvm_page_mask_t *mapped_pages) 4716 { 4717 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); 4718 4719 if (!gpu_state) 4720 return false; 4721 4722 // The caller must ensure that all pages of the input mask are really mapped 4723 UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); 4724 4725 // UVM-Lite GPUs map the preferred location if it's accessible, regardless 4726 // of the resident location. 
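    // So, for a UVM-Lite GPU (an illustrative restatement of the check below,
    // not new behavior): the mapping counts as remote exactly when the block's
    // preferred location is some processor other than this GPU, provided
    // anything is mapped at all.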
4727 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) { 4728 if (uvm_page_mask_empty(mapped_pages)) 4729 return false; 4730 4731 return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id); 4732 } 4733 4734 // Remote pages are pages which are mapped but not resident locally 4735 return uvm_page_mask_andnot(scratch_page_mask, mapped_pages, &gpu_state->resident); 4736 } 4737 4738 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If 4739 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. 4740 // 4741 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The 4742 // caller is responsible for ending the TLB batch with the appropriate membar. 4743 static void block_gpu_pte_clear_4k(uvm_va_block_t *block, 4744 uvm_gpu_t *gpu, 4745 const uvm_page_mask_t *clear_page_mask, 4746 NvU64 pte_clear_val, 4747 uvm_pte_batch_t *pte_batch, 4748 uvm_tlb_batch_t *tlb_batch) 4749 { 4750 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4751 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 4752 uvm_gpu_phys_address_t pte_addr; 4753 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); 4754 uvm_va_block_region_t region = uvm_va_block_region_from_block(block); 4755 uvm_va_block_region_t subregion; 4756 size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; 4757 4758 for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) { 4759 num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page; 4760 4761 pte_addr = uvm_page_table_range_entry_address(tree, 4762 &gpu_state->page_table_range_4k, 4763 subregion.first * ptes_per_page); 4764 4765 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes); 4766 4767 if (tlb_batch) { 4768 uvm_tlb_batch_invalidate(tlb_batch, 4769 uvm_va_block_region_start(block, subregion), 4770 uvm_va_block_region_size(subregion), 4771 UVM_PAGE_SIZE_4K, 4772 UVM_MEMBAR_NONE); 4773 } 4774 } 4775 } 4776 4777 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id 4778 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use 4779 // block_gpu_pte_clear_4k instead. 4780 // 4781 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. 4782 // 4783 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The 4784 // caller is responsible for ending the TLB batch with the appropriate membar. 
4785 static void block_gpu_pte_write_4k(uvm_va_block_t *block, 4786 uvm_gpu_t *gpu, 4787 uvm_processor_id_t resident_id, 4788 uvm_prot_t new_prot, 4789 const uvm_page_mask_t *write_page_mask, 4790 uvm_pte_batch_t *pte_batch, 4791 uvm_tlb_batch_t *tlb_batch) 4792 { 4793 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4794 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 4795 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); 4796 const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; 4797 uvm_va_block_region_t contig_region = {0}; 4798 uvm_gpu_phys_address_t contig_addr = {0}; 4799 uvm_gpu_phys_address_t page_addr = {0}; 4800 uvm_page_index_t page_index; 4801 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 4802 4803 UVM_ASSERT(new_prot != UVM_PROT_NONE); 4804 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 4805 4806 for_each_va_block_page_in_mask(page_index, write_page_mask, block) { 4807 uvm_gpu_phys_address_t pte_addr; 4808 size_t i; 4809 4810 // Assume that this mapping will be used to write to the page 4811 if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) 4812 block_mark_cpu_page_dirty(block, page_index); 4813 4814 if (page_index >= contig_region.outer) { 4815 contig_region = block_phys_contig_region(block, page_index, resident_id); 4816 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); 4817 page_addr = contig_addr; 4818 } 4819 4820 page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE; 4821 4822 pte_addr = uvm_page_table_range_entry_address(tree, 4823 &gpu_state->page_table_range_4k, 4824 page_index * ptes_per_page); 4825 4826 // Handle PAGE_SIZE > GPU PTE size 4827 for (i = 0; i < ptes_per_page; i++) { 4828 NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 4829 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 4830 page_addr.address += UVM_PAGE_SIZE_4K; 4831 pte_addr.address += pte_size; 4832 } 4833 4834 if (tlb_batch) { 4835 NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index); 4836 uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE); 4837 } 4838 } 4839 } 4840 4841 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered. 4842 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It 4843 // only writes 4k PTEs, not big PTEs. 4844 // 4845 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the 4846 // mapping from the corresponding big page (0) and which ones should be written 4847 // using memory from resident_id and new_prot (1). Unlike the other pte_write 4848 // functions, new_prot may be UVM_PROT_NONE. 4849 // 4850 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID 4851 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE 4852 // in this case. 4853 // 4854 // new_pages_mask must not be NULL. 4855 // 4856 // No TLB invalidates are required since we've set up the lower PTEs to never be 4857 // cached by the GPU's MMU when covered by larger PTEs. 
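//
// For example (illustrative only): if a single bit of big_ptes_covered is set
// and new_pages_mask covers only the first half of that big page, the 4k PTEs
// over the first half are written with new_prot (or cleared, when new_prot is
// UVM_PROT_NONE) and the 4k PTEs over the second half are written to match the
// big PTE's current mapping.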
4858 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block, 4859 uvm_va_block_context_t *block_context, 4860 uvm_gpu_t *gpu, 4861 uvm_processor_id_t resident_id, 4862 uvm_prot_t new_prot, 4863 const unsigned long *big_ptes_covered, 4864 const uvm_page_mask_t *new_pages_mask, 4865 uvm_pte_batch_t *pte_batch) 4866 { 4867 uvm_va_block_region_t big_region; 4868 size_t big_page_index; 4869 uvm_processor_id_t curr_resident_id; 4870 uvm_prot_t curr_prot; 4871 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 4872 4873 if (UVM_ID_IS_INVALID(resident_id)) 4874 UVM_ASSERT(new_prot == UVM_PROT_NONE); 4875 4876 for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 4877 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 4878 4879 curr_prot = block_page_prot_gpu(block, gpu, big_region.first); 4880 4881 // The unmap path doesn't know the current residency ahead of time, so 4882 // we have to look it up. 4883 if (UVM_ID_IS_INVALID(resident_id)) { 4884 curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 4885 } 4886 else { 4887 // Check that we aren't changing the aperture of the existing 4888 // mappings. It could be legal in some cases (switching from {RO, A} 4889 // to {RO, B} for example) but we'd need to issue TLB membars. 4890 if (curr_prot != UVM_PROT_NONE) 4891 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id)); 4892 4893 curr_resident_id = resident_id; 4894 } 4895 4896 // pages in new_pages_mask under this big page get new_prot 4897 uvm_page_mask_zero(&block_context->scratch_page_mask); 4898 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 4899 if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { 4900 if (new_prot == UVM_PROT_NONE) { 4901 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 4902 } 4903 else { 4904 block_gpu_pte_write_4k(block, 4905 gpu, 4906 curr_resident_id, 4907 new_prot, 4908 &block_context->scratch_page_mask, 4909 pte_batch, 4910 NULL); 4911 } 4912 } 4913 4914 // All other pages under this big page inherit curr_prot 4915 uvm_page_mask_zero(&block_context->scratch_page_mask); 4916 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 4917 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { 4918 if (curr_prot == UVM_PROT_NONE) { 4919 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 4920 } 4921 else { 4922 block_gpu_pte_write_4k(block, 4923 gpu, 4924 curr_resident_id, 4925 curr_prot, 4926 &block_context->scratch_page_mask, 4927 pte_batch, 4928 NULL); 4929 } 4930 } 4931 } 4932 } 4933 4934 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is 4935 // NULL, all big PTEs in the {block, gpu} are cleared. 4936 // 4937 // If tlb_batch is provided, the big PTEs written are added to the batch. The 4938 // caller is responsible for ending the TLB batch with the appropriate membar. 
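//
// Illustrative call pattern (a sketch only, assuming push, pte_batch,
// tlb_batch and gpu_va_space have been set up by the caller; real call sites
// batch this with other PTE work in the same push):
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
//     block_gpu_pte_clear_big(block, gpu, NULL, 0, pte_batch, tlb_batch);
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);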
4939 static void block_gpu_pte_clear_big(uvm_va_block_t *block, 4940 uvm_gpu_t *gpu, 4941 const unsigned long *big_ptes_mask, 4942 NvU64 pte_clear_val, 4943 uvm_pte_batch_t *pte_batch, 4944 uvm_tlb_batch_t *tlb_batch) 4945 { 4946 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4947 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 4948 NvU32 big_page_size = gpu_va_space->page_tables.big_page_size; 4949 uvm_gpu_phys_address_t pte_addr; 4950 NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size); 4951 size_t big_page_index; 4952 DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4953 4954 if (big_ptes_mask) 4955 bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4956 else 4957 bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size)); 4958 4959 for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 4960 pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables, 4961 &gpu_state->page_table_range_big, 4962 big_page_index); 4963 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1); 4964 4965 if (tlb_batch) { 4966 uvm_tlb_batch_invalidate(tlb_batch, 4967 uvm_va_block_big_page_addr(block, big_page_index, big_page_size), 4968 big_page_size, 4969 big_page_size, 4970 UVM_MEMBAR_NONE); 4971 } 4972 } 4973 } 4974 4975 // Writes the big PTEs in big_ptes_mask using memory from resident_id with 4976 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use 4977 // block_gpu_pte_clear_big instead. 4978 // 4979 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL. 4980 // 4981 // If tlb_batch is provided, the big PTEs written are added to the batch. The 4982 // caller is responsible for ending the TLB batch with the appropriate membar. 
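//
// Note (a summary of the body below, not an extra requirement): the physical
// address for each big PTE is taken from the contiguous region returned by
// block_phys_contig_region, so the backing memory is assumed to be physically
// contiguous for at least big_page_size; block_check_mappings_ptes verifies
// this for mapped big PTEs.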
4983 static void block_gpu_pte_write_big(uvm_va_block_t *block, 4984 uvm_gpu_t *gpu, 4985 uvm_processor_id_t resident_id, 4986 uvm_prot_t new_prot, 4987 const unsigned long *big_ptes_mask, 4988 uvm_pte_batch_t *pte_batch, 4989 uvm_tlb_batch_t *tlb_batch) 4990 { 4991 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4992 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 4993 uvm_page_tree_t *tree = &gpu_va_space->page_tables; 4994 NvU32 big_page_size = tree->big_page_size; 4995 NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size); 4996 size_t big_page_index; 4997 uvm_va_block_region_t contig_region = {0}; 4998 uvm_gpu_phys_address_t contig_addr = {0}; 4999 uvm_gpu_phys_address_t page_addr = {0}; 5000 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 5001 5002 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5003 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5004 UVM_ASSERT(big_ptes_mask); 5005 5006 if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 5007 UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0); 5008 5009 if (!gpu->parent->can_map_sysmem_with_large_pages) 5010 UVM_ASSERT(UVM_ID_IS_GPU(resident_id)); 5011 } 5012 5013 for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5014 NvU64 pte_val; 5015 uvm_gpu_phys_address_t pte_addr; 5016 uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5017 5018 // Assume that this mapping will be used to write to the page 5019 if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) { 5020 uvm_page_index_t page_index; 5021 5022 for_each_va_block_page_in_region(page_index, big_region) 5023 block_mark_cpu_page_dirty(block, page_index); 5024 } 5025 5026 if (big_region.first >= contig_region.outer) { 5027 contig_region = block_phys_contig_region(block, big_region.first, resident_id); 5028 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); 5029 page_addr = contig_addr; 5030 } 5031 5032 page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE; 5033 5034 pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index); 5035 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 5036 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 5037 5038 if (tlb_batch) { 5039 uvm_tlb_batch_invalidate(tlb_batch, 5040 uvm_va_block_region_start(block, big_region), 5041 big_page_size, 5042 big_page_size, 5043 UVM_MEMBAR_NONE); 5044 } 5045 } 5046 } 5047 5048 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in 5049 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and 5050 // tlb_batch in order to poison the now-unused 4k PTEs. 5051 // 5052 // The 4k PTEs are invalidated with the specified membar. 
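//
// Roughly, the sequence below is: write the merged big PTEs as unmapped,
// invalidate those big PTEs plus any 4k PTEs beneath them, end both batches
// with tlb_membar, and finally (on debug builds only, when the 4k table
// exists) poison the now-unused 4k PTEs so stray accesses through them fault
// fatally.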
5053 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block, 5054 uvm_va_block_context_t *block_context, 5055 uvm_gpu_t *gpu, 5056 const unsigned long *big_ptes_to_merge, 5057 uvm_push_t *push, 5058 uvm_pte_batch_t *pte_batch, 5059 uvm_tlb_batch_t *tlb_batch, 5060 uvm_membar_t tlb_membar) 5061 { 5062 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5063 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5064 NvU32 big_page_size = tree->big_page_size; 5065 NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size); 5066 size_t big_page_index; 5067 DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5068 5069 UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5070 UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5071 5072 // We can be called with the 4k PTEs in two cases: 5073 // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active. 5074 // 5075 // 2) 4k PTEs unallocated. In this case the GPU may not have invalid 4k PTEs 5076 // active under the big PTE, depending on whether neighboring blocks 5077 // caused the page tables to be allocated. 5078 // 5079 // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has 5080 // them cached. 5081 5082 // Each big PTE is currently invalid so the 4ks are active (or unallocated). 5083 // First make the big PTEs unmapped to disable future lookups of the 4ks 5084 // under it. We can't directly transition the entry from valid 4k PTEs to 5085 // valid big PTEs, because that could cause the GPU TLBs to cache the same 5086 // VA in different cache lines. That could cause memory ordering to not be 5087 // maintained. 5088 block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch); 5089 5090 // Now invalidate the big PTEs we just wrote as well as all 4ks under them. 5091 // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only 5092 // need to invalidate the 4k PTEs without actually writing them. 5093 for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5094 uvm_tlb_batch_invalidate(tlb_batch, 5095 uvm_va_block_big_page_addr(block, big_page_index, big_page_size), 5096 big_page_size, 5097 big_page_size | UVM_PAGE_SIZE_4K, 5098 UVM_MEMBAR_NONE); 5099 } 5100 5101 // End the batches for the caller. We need to do this here in order to 5102 // poison the 4ks below. 5103 uvm_pte_batch_end(pte_batch); 5104 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5105 5106 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused 5107 // PTEs with a pattern which will trigger fatal faults on access. We have to 5108 // do this after the TLB invalidate of the big PTEs, or the GPU might use 5109 // the new values. 5110 if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) { 5111 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge); 5112 uvm_pte_batch_begin(push, pte_batch); 5113 block_gpu_pte_clear_4k(block, 5114 gpu, 5115 &block_context->scratch_page_mask, 5116 tree->hal->poisoned_pte(), 5117 pte_batch, 5118 NULL); 5119 uvm_pte_batch_end(pte_batch); 5120 } 5121 } 5122 5123 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}. 5124 // 5125 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is 5126 // responsible for ending the TLB batch with the appropriate membar. 
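//
// Illustrative call pattern (a sketch, assuming push, pte_batch, tlb_batch and
// gpu_va_space are set up by the caller; block_gpu_unmap_to_2m does exactly
// this for a valid 2M PTE):
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
//     block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);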
5127 static void block_gpu_pte_clear_2m(uvm_va_block_t *block, 5128 uvm_gpu_t *gpu, 5129 uvm_pte_batch_t *pte_batch, 5130 uvm_tlb_batch_t *tlb_batch) 5131 { 5132 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5133 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5134 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); 5135 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); 5136 5137 // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE, 5138 // which would cause a problem when trying to make the entry invalid since 5139 // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire 5140 // 16 bytes. 5141 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1); 5142 5143 if (tlb_batch) 5144 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5145 } 5146 5147 // Writes the 2M PTE for {block, gpu} using memory from resident_id with 5148 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use 5149 // block_gpu_pte_clear_2m instead. 5150 // 5151 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is 5152 // responsible for ending the TLB batch with the appropriate membar. 5153 static void block_gpu_pte_write_2m(uvm_va_block_t *block, 5154 uvm_gpu_t *gpu, 5155 uvm_processor_id_t resident_id, 5156 uvm_prot_t new_prot, 5157 uvm_pte_batch_t *pte_batch, 5158 uvm_tlb_batch_t *tlb_batch) 5159 { 5160 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5161 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5162 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); 5163 uvm_gpu_phys_address_t page_addr; 5164 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); 5165 NvU64 pte_val; 5166 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 5167 5168 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5169 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5170 5171 if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) 5172 block_mark_cpu_page_dirty(block, 0); 5173 5174 page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu); 5175 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 5176 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 5177 5178 if (tlb_batch) 5179 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5180 } 5181 5182 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu) 5183 { 5184 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5185 5186 if (!block_gpu_supports_2m(block, gpu)) 5187 return false; 5188 5189 if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) || 5190 (gpu_state->page_table_range_4k.table && !gpu_state->activated_4k)) 5191 return true; 5192 5193 return false; 5194 } 5195 5196 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or 5197 // activates a newly-allocated page table (big or 4k) while the other is already 5198 // active. The caller must have already written the new PTEs under the table 5199 // with the appropriate membar. 
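//
// Illustrative call pattern (a sketch; block_gpu_pte_finish_split_2m performs
// exactly this sequence as its final step, assuming tree, tlb_batch and push
// are already set up):
//
//     uvm_tlb_batch_begin(tree, tlb_batch);
//     block_gpu_write_pde(block, gpu, push, tlb_batch);
//     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);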
5200 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch) 5201 { 5202 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5203 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5204 5205 if (!gpu_state->pte_is_2m) 5206 UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu)); 5207 5208 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); 5209 5210 // We always need a membar to order PDE/PTE writes with the TLB invalidate. 5211 // write_pde will do a MEMBAR_SYS by default. 5212 if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID) 5213 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); 5214 uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push); 5215 5216 gpu->parent->host_hal->wait_for_idle(push); 5217 5218 // Invalidate just the PDE 5219 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5220 5221 if (gpu_state->page_table_range_big.table) 5222 gpu_state->activated_big = true; 5223 5224 if (gpu_state->page_table_range_4k.table) 5225 gpu_state->activated_4k = true; 5226 } 5227 5228 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should 5229 // have written all lower PTEs as appropriate into the given pte_batch already. 5230 // This function ends the PTE batch, activates the 2M PDE, and does a TLB 5231 // invalidate. 5232 // 5233 // The caller does not need to do any TLB invalidates since none of the lower 5234 // PTEs could be cached. 5235 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block, 5236 uvm_gpu_t *gpu, 5237 uvm_push_t *push, 5238 uvm_pte_batch_t *pte_batch, 5239 uvm_tlb_batch_t *tlb_batch, 5240 uvm_membar_t tlb_membar) 5241 { 5242 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5243 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5244 5245 // Step 1: Make the 2M entry invalid. We can't directly transition from a 5246 // valid 2M PTE to valid lower PTEs, because that could cause the 5247 // GPU TLBs to cache the same VA in different cache lines. That 5248 // could cause memory ordering to not be maintained. 5249 // 5250 // If the 2M PTE is already invalid, no TLB invalidate is needed. 5251 5252 if (curr_prot == UVM_PROT_NONE) { 5253 // If we aren't downgrading, then we don't need a membar. 5254 UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE); 5255 5256 // End the batch, which pushes a membar to ensure that the caller's PTE 5257 // writes below 2M are observed before the PDE write we're about to do. 5258 uvm_pte_batch_end(pte_batch); 5259 } 5260 else { 5261 // The 64k and 4k PTEs can't possibly be cached since the 2M entry is 5262 // not yet a PDE, so we just need to invalidate this single 2M entry. 5263 uvm_tlb_batch_begin(tree, tlb_batch); 5264 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); 5265 5266 // Make sure the PTE writes are observed before the TLB invalidate 5267 uvm_pte_batch_end(pte_batch); 5268 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5269 } 5270 5271 // Step 2: Switch the 2M entry from invalid to a PDE. This activates the 5272 // smaller PTEs. 5273 uvm_tlb_batch_begin(tree, tlb_batch); 5274 block_gpu_write_pde(block, gpu, push, tlb_batch); 5275 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5276 } 5277 5278 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE. 
5279 // Any lower PTEs are invalidated with the specified membar. 5280 static void block_gpu_pte_merge_2m(uvm_va_block_t *block, 5281 uvm_va_block_context_t *block_context, 5282 uvm_gpu_t *gpu, 5283 uvm_push_t *push, 5284 uvm_membar_t tlb_membar) 5285 { 5286 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5287 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5288 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5289 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5290 NvU32 tlb_inval_sizes; 5291 5292 UVM_ASSERT(!gpu_state->pte_is_2m); 5293 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); 5294 5295 // The 2M entry is currently a PDE, so first make it invalid. We can't 5296 // directly transition the entry from a valid PDE to a valid 2M PTE, because 5297 // that could cause the GPU TLBs to cache the same VA in different cache 5298 // lines. That could cause memory ordering to not be maintained. 5299 uvm_pte_batch_begin(push, pte_batch); 5300 block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL); 5301 uvm_pte_batch_end(pte_batch); 5302 5303 // Now invalidate both the 2M entry we just wrote as well as all lower-level 5304 // entries which could be cached. Subsequent MMU fills will stop at the now- 5305 // invalid 2M entry, so we only need to invalidate the lower PTEs without 5306 // actually writing them. 5307 tlb_inval_sizes = UVM_PAGE_SIZE_2M; 5308 if (gpu_state->page_table_range_big.table) 5309 tlb_inval_sizes |= UVM_PAGE_SIZE_64K; 5310 5311 // Strictly-speaking we only need to invalidate those 4k ranges which are 5312 // not covered by a big pte. However, any such invalidate will require 5313 // enough 4k invalidates to force the TLB batching to invalidate everything 5314 // anyway, so just do the simpler thing. 5315 if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5316 tlb_inval_sizes |= UVM_PAGE_SIZE_4K; 5317 5318 uvm_tlb_batch_begin(tree, tlb_batch); 5319 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE); 5320 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5321 5322 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused 5323 // PTEs with a pattern which will trigger fatal faults on access. We have to 5324 // do this after the TLB invalidate of the 2M entry, or the GPU might use 5325 // the new values. 5326 if (UVM_IS_DEBUG()) { 5327 uvm_pte_batch_begin(push, pte_batch); 5328 5329 if (gpu_state->page_table_range_big.table) { 5330 block_gpu_pte_clear_big(block, 5331 gpu, 5332 NULL, 5333 tree->hal->poisoned_pte(), 5334 pte_batch, 5335 NULL); 5336 } 5337 5338 if (gpu_state->page_table_range_4k.table) { 5339 block_gpu_pte_clear_4k(block, 5340 gpu, 5341 NULL, 5342 tree->hal->poisoned_pte(), 5343 pte_batch, 5344 NULL); 5345 } 5346 5347 uvm_pte_batch_end(pte_batch); 5348 } 5349 } 5350 5351 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) 5352 { 5353 // Permissions upgrades (MAP) don't need membars 5354 if (pte_op == BLOCK_PTE_OP_MAP) 5355 return UVM_MEMBAR_NONE; 5356 5357 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5358 UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE); 5359 5360 return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id)); 5361 } 5362 5363 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot 5364 // permissions. 
If the 2M entry is currently a PDE, it is first merged into a 5365 // PTE. 5366 // 5367 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead. 5368 // 5369 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5370 // the TLB membar required. 5371 static void block_gpu_map_to_2m(uvm_va_block_t *block, 5372 uvm_va_block_context_t *block_context, 5373 uvm_gpu_t *gpu, 5374 uvm_processor_id_t resident_id, 5375 uvm_prot_t new_prot, 5376 uvm_push_t *push, 5377 block_pte_op_t pte_op) 5378 { 5379 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5380 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5381 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5382 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5383 uvm_membar_t tlb_membar; 5384 5385 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5386 5387 // If we have a mix of big and 4k PTEs, we have to first merge them to an 5388 // invalid 2M PTE. 5389 if (!gpu_state->pte_is_2m) { 5390 block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE); 5391 5392 gpu_state->pte_is_2m = true; 5393 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5394 } 5395 5396 // Write the new permissions 5397 uvm_pte_batch_begin(push, pte_batch); 5398 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 5399 5400 block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch); 5401 5402 uvm_pte_batch_end(pte_batch); 5403 5404 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5405 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5406 } 5407 5408 // Combination split + map operation, called when only part of a 2M PTE mapping 5409 // is being changed. This splits an existing valid or invalid 2M PTE into the 5410 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state. 5411 // 5412 // The PTEs covering the pages in pages_to_write are written to the memory on 5413 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. 5414 // 5415 // The PTEs covering the pages not set in pages_to_write inherit the mapping of 5416 // the current 2M PTE. If the current mapping is valid, it must target 5417 // resident_id. 5418 // 5419 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5420 // the TLB membar required. 
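//
// For example (illustrative only, assuming the big page table is allocated):
// if the current 2M PTE is valid, pages_to_write covers exactly one big-page
// region, and new_pte_state keeps every region as a big PTE, then that covered
// big PTE is written with new_prot, the remaining big PTEs are written with
// the inherited 2M permissions, and no 4k PTEs need to be written at all.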
5421 static void block_gpu_map_split_2m(uvm_va_block_t *block, 5422 uvm_va_block_context_t *block_context, 5423 uvm_gpu_t *gpu, 5424 uvm_processor_id_t resident_id, 5425 const uvm_page_mask_t *pages_to_write, 5426 uvm_prot_t new_prot, 5427 uvm_push_t *push, 5428 block_pte_op_t pte_op) 5429 { 5430 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5431 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5432 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5433 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5434 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5435 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5436 uvm_membar_t tlb_membar; 5437 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5438 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5439 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5440 5441 UVM_ASSERT(gpu_state->pte_is_2m); 5442 5443 if (!gpu_state->page_table_range_4k.table) 5444 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5445 5446 uvm_pte_batch_begin(push, pte_batch); 5447 5448 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5449 // from the lower levels. This means we don't need to issue a TLB invalidate 5450 // when writing those levels. 5451 5452 // Cases to handle: 5453 // 1) Big PTEs which inherit curr_prot 5454 // 2) Big PTEs which get new_prot 5455 // 3) Big PTEs which are split to 4k 5456 // a) 4k PTEs which inherit curr_prot under the split big PTEs 5457 // b) 4k PTEs which get new_prot under the split big PTEs 5458 5459 // Compute the big PTEs which will need to be split to 4k, if any. 5460 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5461 5462 if (gpu_state->page_table_range_big.table) { 5463 // Case 1: Write the big PTEs which will inherit the 2M permissions, if 5464 // any. These are the big PTEs which are unchanged (uncovered) by the 5465 // operation. 5466 bitmap_andnot(big_ptes_inherit, 5467 new_pte_state->big_ptes, 5468 new_pte_state->big_ptes_covered, 5469 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5470 5471 if (curr_prot == UVM_PROT_NONE) { 5472 block_gpu_pte_clear_big(block, 5473 gpu, 5474 big_ptes_inherit, 5475 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), 5476 pte_batch, 5477 NULL); 5478 } 5479 else { 5480 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); 5481 } 5482 5483 // Case 2: Write the new big PTEs 5484 bitmap_and(big_ptes_new_prot, 5485 new_pte_state->big_ptes, 5486 new_pte_state->big_ptes_covered, 5487 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5488 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL); 5489 5490 // Case 3: Write the big PTEs which cover 4k PTEs 5491 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5492 5493 // We just wrote all possible big PTEs, so mark them as initialized 5494 gpu_state->initialized_big = true; 5495 } 5496 else { 5497 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5498 } 5499 5500 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs 5501 block_gpu_pte_big_split_write_4k(block, 5502 block_context, 5503 gpu, 5504 resident_id, 5505 new_prot, 5506 big_ptes_split, 5507 pages_to_write, 5508 pte_batch); 5509 5510 // Activate the 2M PDE. 
This ends the pte_batch and issues a single TLB 5511 // invalidate for the 2M entry. 5512 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5513 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar); 5514 5515 gpu_state->pte_is_2m = false; 5516 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5517 } 5518 5519 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed. 5520 // 5521 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should 5522 // be 4k. 5523 static void block_gpu_split_2m(uvm_va_block_t *block, 5524 uvm_va_block_context_t *block_context, 5525 uvm_gpu_t *gpu, 5526 const unsigned long *new_big_ptes, 5527 uvm_push_t *push) 5528 { 5529 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5530 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5531 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5532 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5533 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5534 DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5535 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5536 NvU64 unmapped_pte_val; 5537 uvm_processor_id_t curr_residency; 5538 5539 UVM_ASSERT(gpu_state->pte_is_2m); 5540 5541 if (new_big_ptes) 5542 bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5543 else 5544 bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5545 5546 if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5547 UVM_ASSERT(gpu_state->page_table_range_big.table); 5548 5549 // We're splitting from 2M to big only, so we'll be writing all big PTEs 5550 if (gpu_state->page_table_range_big.table) 5551 gpu_state->initialized_big = true; 5552 5553 // Cases to handle: 5554 // 1) Big PTEs which inherit curr_prot 5555 // 2) Big PTEs which are split to 4k 5556 // a) 4k PTEs inherit curr_prot under the split big PTEs 5557 5558 // big_ptes_split will cover the 4k regions 5559 bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5560 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split); 5561 5562 uvm_pte_batch_begin(push, pte_batch); 5563 5564 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5565 // from the lower levels. This means we don't need to issue a TLB invalidate 5566 // when writing those levels. 
5567 5568 if (curr_prot == UVM_PROT_NONE) { 5569 unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size); 5570 5571 // Case 2a: Clear the 4k PTEs under big_ptes_split 5572 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 5573 5574 // Case 1: Make the remaining big PTEs unmapped 5575 block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL); 5576 } 5577 else { 5578 curr_residency = block_gpu_get_processor_to_map(block, gpu, 0); 5579 5580 // Case 2a: Write the new 4k PTEs under big_ptes_split 5581 block_gpu_pte_write_4k(block, 5582 gpu, 5583 curr_residency, 5584 curr_prot, 5585 &block_context->scratch_page_mask, 5586 pte_batch, 5587 NULL); 5588 5589 // Case 1: Write the new big PTEs 5590 block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL); 5591 } 5592 5593 // Case 2: Make big_ptes_split invalid to activate the 4k PTEs 5594 if (gpu_state->page_table_range_big.table) 5595 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5596 5597 // Activate the 2M PDE. This ends the pte_batch and issues a single TLB 5598 // invalidate for the 2M entry. No membar is necessary since we aren't 5599 // changing permissions. 5600 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE); 5601 5602 gpu_state->pte_is_2m = false; 5603 bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5604 } 5605 5606 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are 5607 // changed. 5608 // 5609 // big_ptes_to_split must not be NULL. 5610 static void block_gpu_split_big(uvm_va_block_t *block, 5611 uvm_va_block_context_t *block_context, 5612 uvm_gpu_t *gpu, 5613 const unsigned long *big_ptes_to_split, 5614 uvm_push_t *push) 5615 { 5616 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5617 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5618 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5619 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5620 NvU32 big_page_size = tree->big_page_size; 5621 uvm_va_block_region_t big_region; 5622 uvm_processor_id_t resident_id; 5623 size_t big_page_index; 5624 uvm_prot_t curr_prot; 5625 DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5626 5627 UVM_ASSERT(!gpu_state->pte_is_2m); 5628 UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5629 UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5630 5631 uvm_pte_batch_begin(push, pte_batch); 5632 uvm_tlb_batch_begin(tree, tlb_batch); 5633 5634 // Write all 4k PTEs under all big PTEs which are being split. We'll make 5635 // the big PTEs inactive below after flushing these writes. No TLB 5636 // invalidate is needed since the big PTE is active. 
5637 bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5638 for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5639 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5640 curr_prot = block_page_prot_gpu(block, gpu, big_region.first); 5641 5642 uvm_page_mask_zero(&block_context->scratch_page_mask); 5643 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 5644 if (curr_prot == UVM_PROT_NONE) { 5645 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 5646 } 5647 else { 5648 __set_bit(big_page_index, big_ptes_valid); 5649 5650 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 5651 5652 block_gpu_pte_write_4k(block, 5653 gpu, 5654 resident_id, 5655 curr_prot, 5656 &block_context->scratch_page_mask, 5657 pte_batch, 5658 NULL); 5659 } 5660 } 5661 5662 // Unmap the big PTEs which are valid and are being split to 4k. We can't 5663 // directly transition from a valid big PTE to valid lower PTEs, because 5664 // that could cause the GPU TLBs to cache the same VA in different cache 5665 // lines. That could cause memory ordering to not be maintained. 5666 block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); 5667 5668 // End the batches. We have to commit the membars and TLB invalidates 5669 // before we finish splitting formerly-big PTEs. No membar is necessary 5670 // since we aren't changing permissions. 5671 uvm_pte_batch_end(pte_batch); 5672 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5673 5674 // Finish the split by switching the big PTEs from unmapped to invalid. This 5675 // causes the GPU MMU to start reading the 4k PTEs instead of stopping at 5676 // the unmapped big PTEs. 5677 uvm_pte_batch_begin(push, pte_batch); 5678 uvm_tlb_batch_begin(tree, tlb_batch); 5679 5680 block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch); 5681 5682 uvm_pte_batch_end(pte_batch); 5683 5684 // Finally, activate the page tables if they're inactive 5685 if (block_gpu_needs_to_activate_table(block, gpu)) 5686 block_gpu_write_pde(block, gpu, push, tlb_batch); 5687 5688 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5689 5690 bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5691 } 5692 5693 // Changes permissions on some pre-existing mix of big and 4k PTEs into some 5694 // other mix of big and 4k PTEs, as described by 5695 // block_context->mapping.new_pte_state. 5696 // 5697 // The PTEs covering the pages in pages_to_write are written to the memory on 5698 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. 5699 // 5700 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5701 // the TLB membar required. 
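//
// Worked example (illustrative masks only, using four big-page slots for
// brevity): with gpu_state->big_ptes = 0b0011 and new_pte_state->big_ptes =
// 0b0110, slot 0 is split from big to 4k (case 1), slot 1 stays big and takes
// new_prot if covered (case 3), slot 2 is merged from 4k to big (case 2), and
// slot 3 stays 4k (case 4).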
5702 static void block_gpu_map_big_and_4k(uvm_va_block_t *block, 5703 uvm_va_block_context_t *block_context, 5704 uvm_gpu_t *gpu, 5705 uvm_processor_id_t resident_id, 5706 const uvm_page_mask_t *pages_to_write, 5707 uvm_prot_t new_prot, 5708 uvm_push_t *push, 5709 block_pte_op_t pte_op) 5710 { 5711 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5712 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5713 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5714 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5715 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5716 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5717 DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5718 DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5719 DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5720 uvm_va_block_region_t big_region; 5721 size_t big_page_index; 5722 NvU32 big_page_size = tree->big_page_size; 5723 uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5724 5725 UVM_ASSERT(!gpu_state->pte_is_2m); 5726 5727 uvm_pte_batch_begin(push, pte_batch); 5728 uvm_tlb_batch_begin(tree, tlb_batch); 5729 5730 // All of these cases might be perfomed in the same call: 5731 // 1) Split currently-big PTEs to 4k 5732 // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs 5733 // b) Write new 4k PTEs which get new_prot under the split big PTEs 5734 // 2) Merge currently-4k PTEs to big with new_prot 5735 // 3) Write currently-big PTEs which wholly get new_prot 5736 // 4) Write currently-4k PTEs which get new_prot 5737 // 5) Initialize big PTEs which are not covered by this operation 5738 5739 // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are 5740 // being split. We'll make the big PTEs inactive below after flushing these 5741 // writes. No TLB invalidate is needed since the big PTE is active. 5742 // 5743 // Mask computation: big_before && !big_after 5744 bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5745 5746 block_gpu_pte_big_split_write_4k(block, 5747 block_context, 5748 gpu, 5749 resident_id, 5750 new_prot, 5751 big_ptes_split, 5752 pages_to_write, 5753 pte_batch); 5754 5755 // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and 5756 // remain uncovered after the operation. 5757 // 5758 // Mask computation: !big_before && !big_after 5759 bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5760 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after); 5761 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) { 5762 block_gpu_pte_write_4k(block, 5763 gpu, 5764 resident_id, 5765 new_prot, 5766 &block_context->scratch_page_mask, 5767 pte_batch, 5768 tlb_batch); 5769 } 5770 5771 // Case 5: If the big page table is newly-allocated, make sure that all big 5772 // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are 5773 // all initialized to invalid. 5774 // 5775 // The similar case of making newly-allocated big PTEs unmapped when no 5776 // lower 4k table is present is handled by having 5777 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes 5778 // appropriately. 
5779 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { 5780 // TODO: Bug 1766424: If we have the 4k page table already, we could 5781 // attempt to merge all uncovered big PTE regions when first 5782 // allocating the big table. That's probably not worth doing. 5783 UVM_ASSERT(gpu_state->page_table_range_4k.table); 5784 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5785 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); 5786 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); 5787 gpu_state->initialized_big = true; 5788 } 5789 5790 // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are 5791 // being split to 4k. We can't directly transition from a valid big PTE to 5792 // valid lower PTEs, because that could cause the GPU TLBs to cache the same 5793 // VA in different cache lines. That could cause memory ordering to not be 5794 // maintained. 5795 bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5796 for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5797 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5798 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first)) 5799 __set_bit(big_page_index, big_ptes_mask); 5800 } 5801 5802 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); 5803 5804 // Case 3: Write the currently-big PTEs which remain big PTEs, and are 5805 // wholly changing permissions. 5806 // 5807 // Mask computation: big_before && big_after && covered 5808 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5809 if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5810 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch); 5811 5812 // Case 2 (step 1): Merge the new big PTEs and end the batches, now that 5813 // we've done all of the independent PTE writes we can. This also merges 5814 // newly-allocated uncovered big PTEs to unmapped (see 5815 // block_gpu_compute_new_pte_state). 5816 // 5817 // Mask computation: !big_before && big_after 5818 if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 5819 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB 5820 // batches. 5821 block_gpu_pte_merge_big_and_end(block, 5822 block_context, 5823 gpu, 5824 big_ptes_merge, 5825 push, 5826 pte_batch, 5827 tlb_batch, 5828 tlb_membar); 5829 5830 // Remove uncovered big PTEs. We needed to merge them to unmapped above, 5831 // but they shouldn't get new_prot below. 5832 bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5833 } 5834 else { 5835 // End the batches. We have to commit the membars and TLB invalidates 5836 // before we finish splitting formerly-big PTEs. 
5837 uvm_pte_batch_end(pte_batch); 5838 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5839 } 5840 5841 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 5842 !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 5843 block_gpu_needs_to_activate_table(block, gpu)) { 5844 5845 uvm_pte_batch_begin(push, pte_batch); 5846 uvm_tlb_batch_begin(tree, tlb_batch); 5847 5848 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by 5849 // switching them from unmapped to invalid. This causes the GPU MMU to 5850 // start reading the 4k PTEs instead of stopping at the unmapped big 5851 // PTEs. 5852 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); 5853 5854 // Case 2 (step 2): Finish merging our big PTEs, if we have any, by 5855 // switching them from unmapped to new_prot. 5856 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch); 5857 5858 uvm_pte_batch_end(pte_batch); 5859 5860 // Finally, activate the page tables if they're inactive 5861 if (block_gpu_needs_to_activate_table(block, gpu)) 5862 block_gpu_write_pde(block, gpu, push, tlb_batch); 5863 5864 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5865 } 5866 5867 // Update gpu_state 5868 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5869 } 5870 5871 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is 5872 // merged into a PTE. 5873 static void block_gpu_unmap_to_2m(uvm_va_block_t *block, 5874 uvm_va_block_context_t *block_context, 5875 uvm_gpu_t *gpu, 5876 uvm_push_t *push, 5877 uvm_membar_t tlb_membar) 5878 { 5879 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5880 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5881 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5882 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5883 5884 if (gpu_state->pte_is_2m) { 5885 // If we're already mapped as a valid 2M PTE, just write it to invalid 5886 uvm_pte_batch_begin(push, pte_batch); 5887 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 5888 5889 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); 5890 5891 uvm_pte_batch_end(pte_batch); 5892 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5893 } 5894 else { 5895 // Otherwise we have a mix of big and 4K PTEs which need to be merged 5896 // into an invalid 2M PTE. 5897 block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar); 5898 5899 gpu_state->pte_is_2m = true; 5900 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5901 } 5902 } 5903 5904 // Combination split + unmap operation, called when only part of a valid 2M PTE 5905 // mapping is being unmapped. The 2M PTE is split into a mix of valid and 5906 // invalid big and/or 4k PTEs, as described by 5907 // block_context->mapping.new_pte_state. 5908 // 5909 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped). 5910 // 5911 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of 5912 // the current 2M PTE. 
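//
// A rough sketch of the masks driving the case split below (split/inherit/
// unmap and nbits are illustrative shorthand for the local bitmaps computed in
// the function body, with nbits standing for MAX_BIG_PAGES_PER_UVM_VA_BLOCK):
//
//     bitmap_complement(split, new_pte_state->big_ptes, nbits);     // case 3
//     bitmap_andnot(inherit, new_pte_state->big_ptes,
//                   new_pte_state->big_ptes_covered, nbits);        // case 1
//     bitmap_and(unmap, new_pte_state->big_ptes,
//                new_pte_state->big_ptes_covered, nbits);           // case 2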
5913 static void block_gpu_unmap_split_2m(uvm_va_block_t *block, 5914 uvm_va_block_context_t *block_context, 5915 uvm_gpu_t *gpu, 5916 const uvm_page_mask_t *pages_to_unmap, 5917 uvm_push_t *push, 5918 uvm_membar_t tlb_membar) 5919 { 5920 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5921 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5922 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5923 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5924 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5925 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5926 uvm_processor_id_t resident_id; 5927 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5928 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5929 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5930 5931 UVM_ASSERT(gpu_state->pte_is_2m); 5932 5933 resident_id = block_gpu_get_processor_to_map(block, gpu, 0); 5934 5935 uvm_pte_batch_begin(push, pte_batch); 5936 5937 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5938 // from the lower levels. This means we don't need to issue a TLB invalidate 5939 // when writing those levels. 5940 5941 // Cases to handle: 5942 // 1) Big PTEs which inherit curr_prot 5943 // 2) Big PTEs which get unmapped 5944 // 3) Big PTEs which are split to 4k 5945 // a) 4k PTEs which inherit curr_prot under the split big PTEs 5946 // b) 4k PTEs which get unmapped under the split big PTEs 5947 5948 // Compute the big PTEs which will need to be split to 4k, if any. 5949 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5950 5951 if (gpu_state->page_table_range_big.table) { 5952 // Case 1: Write the big PTEs which will inherit the 2M permissions, if 5953 // any. These are the big PTEs which are unchanged (uncovered) by the 5954 // operation. 5955 bitmap_andnot(big_ptes_inherit, 5956 new_pte_state->big_ptes, 5957 new_pte_state->big_ptes_covered, 5958 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5959 5960 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); 5961 5962 // Case 2: Clear the new big PTEs which get unmapped (those not covering 5963 // 4ks) 5964 bitmap_and(big_ptes_new_prot, 5965 new_pte_state->big_ptes, 5966 new_pte_state->big_ptes_covered, 5967 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5968 5969 block_gpu_pte_clear_big(block, 5970 gpu, 5971 big_ptes_new_prot, 5972 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), 5973 pte_batch, 5974 NULL); 5975 5976 // Case 3: Write the big PTEs which cover 4k PTEs 5977 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5978 5979 // We just wrote all possible big PTEs, so mark them as initialized 5980 gpu_state->initialized_big = true; 5981 } 5982 else { 5983 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5984 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5985 } 5986 5987 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs 5988 block_gpu_pte_big_split_write_4k(block, 5989 block_context, 5990 gpu, 5991 resident_id, 5992 UVM_PROT_NONE, 5993 big_ptes_split, 5994 pages_to_unmap, 5995 pte_batch); 5996 5997 // And activate the 2M PDE. This ends the pte_batch and issues a single TLB 5998 // invalidate for the 2M entry. 
5999     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6000 
6001     gpu_state->pte_is_2m = false;
6002     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6003 }
6004 
6005 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
6006 // and 4k PTEs.
6007 //
6008 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
6009 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
6010                                        uvm_va_block_context_t *block_context,
6011                                        uvm_gpu_t *gpu,
6012                                        const uvm_page_mask_t *pages_to_unmap,
6013                                        uvm_push_t *push,
6014                                        uvm_membar_t tlb_membar)
6015 {
6016     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6017     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6018     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6019     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6020     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6021     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6022     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6023     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6024     NvU32 big_page_size = tree->big_page_size;
6025     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
6026 
6027     UVM_ASSERT(!gpu_state->pte_is_2m);
6028 
6029     uvm_pte_batch_begin(push, pte_batch);
6030     uvm_tlb_batch_begin(tree, tlb_batch);
6031 
6032     // All of these cases might be performed in the same call:
6033     // 1) Split currently-big PTEs to 4k
6034     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
6035     //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
6036     // 2) Merge currently-4k PTEs to unmapped big
6037     // 3) Clear currently-big PTEs which wholly get unmapped
6038     // 4) Clear currently-4k PTEs which get unmapped
6039     // 5) Initialize big PTEs which are not covered by this operation
6040 
6041     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
6042     // being split. We'll make the big PTEs inactive below after flushing these
6043     // writes. No TLB invalidate is needed since the big PTE is active.
6044     //
6045     // Mask computation: big_before && !big_after
6046     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6047 
6048     block_gpu_pte_big_split_write_4k(block,
6049                                      block_context,
6050                                      gpu,
6051                                      UVM_ID_INVALID,
6052                                      UVM_PROT_NONE,
6053                                      big_ptes_split,
6054                                      pages_to_unmap,
6055                                      pte_batch);
6056 
6057     // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
6058     // remain uncovered after the unmap.
6059     //
6060     // Mask computation: !big_before && !big_after
6061     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6062     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
6063     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
6064         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
6065 
6066     // Case 5: If the big page table is newly-allocated, make sure that all big
6067     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
6068     // all initialized to invalid.
6069 // 6070 // The similar case of making newly-allocated big PTEs unmapped when no 6071 // lower 4k table is present is handled by having 6072 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes 6073 // appropriately. 6074 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { 6075 // TODO: Bug 1766424: If we have the 4k page table already, we could 6076 // attempt to merge all uncovered big PTE regions when first 6077 // allocating the big table. That's probably not worth doing. 6078 UVM_ASSERT(gpu_state->page_table_range_4k.table); 6079 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6080 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); 6081 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); 6082 gpu_state->initialized_big = true; 6083 } 6084 6085 // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are 6086 // getting wholly unmapped, and those currently-big PTEs which are being 6087 // split to 4k. We can't directly transition from a valid big PTE to valid 6088 // lower PTEs, because that could cause the GPU TLBs to cache the same VA in 6089 // different cache lines. That could cause memory ordering to not be 6090 // maintained. 6091 // 6092 // Mask computation: (big_before && big_after && covered) || 6093 // (big_before && !big_after) 6094 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6095 bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6096 bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6097 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch); 6098 6099 // Case 2: Merge the new big PTEs and end the batches, now that we've done 6100 // all of the independent PTE writes we can. 6101 // 6102 // Mask computation: !big_before && big_after 6103 if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 6104 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB 6105 // batches. 6106 block_gpu_pte_merge_big_and_end(block, 6107 block_context, 6108 gpu, 6109 big_ptes_mask, 6110 push, 6111 pte_batch, 6112 tlb_batch, 6113 tlb_membar); 6114 } 6115 else { 6116 // End the batches. We have to commit the membars and TLB invalidates 6117 // before we finish splitting formerly-big PTEs. 6118 uvm_pte_batch_end(pte_batch); 6119 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 6120 } 6121 6122 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 6123 block_gpu_needs_to_activate_table(block, gpu)) { 6124 uvm_pte_batch_begin(push, pte_batch); 6125 uvm_tlb_batch_begin(tree, tlb_batch); 6126 6127 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by 6128 // switching them from unmapped to invalid. This causes the GPU MMU to 6129 // start reading the 4k PTEs instead of stopping at the unmapped big 6130 // PTEs. 
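    //
    // Sketched as a sequence (illustrative only), each previously-valid big
    // PTE that is being split goes through two states:
    //
    //     valid big PTE  ->  unmapped big PTE  (written above, TLB invalidated)
    //                    ->  invalid big PTE   (written below, MMU reads the 4k level)
    //
    // so the TLB never holds the same VA under both a big and a 4k
    // translation, which could otherwise break memory ordering.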
6131 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); 6132 6133 uvm_pte_batch_end(pte_batch); 6134 6135 // Finally, activate the page tables if they're inactive 6136 if (block_gpu_needs_to_activate_table(block, gpu)) 6137 block_gpu_write_pde(block, gpu, push, tlb_batch); 6138 6139 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 6140 } 6141 6142 // Update gpu_state 6143 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6144 } 6145 6146 // When PTE state is about to change (for example due to a map/unmap/revoke 6147 // operation), this function decides how to split and merge the PTEs in response 6148 // to that operation. 6149 // 6150 // The operation is described with the two page masks: 6151 // 6152 // - pages_changing indicates which pages will have their PTE mappings changed 6153 // on the GPU in some way as a result of the operation (for example, which 6154 // pages will actually have their mapping permissions upgraded). 6155 // 6156 // - page_mask_after indicates which pages on this GPU will have exactly the 6157 // same PTE attributes (permissions, residency) as pages_changing after the 6158 // operation is applied. 6159 // 6160 // PTEs are merged eagerly. 6161 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block, 6162 uvm_gpu_t *gpu, 6163 uvm_processor_id_t resident_id, 6164 const uvm_page_mask_t *pages_changing, 6165 const uvm_page_mask_t *page_mask_after, 6166 uvm_va_block_new_pte_state_t *new_pte_state) 6167 { 6168 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6169 uvm_va_block_region_t big_region_all, big_page_region, region; 6170 NvU32 big_page_size; 6171 uvm_page_index_t page_index; 6172 size_t big_page_index; 6173 DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6174 bool can_make_new_big_ptes; 6175 6176 memset(new_pte_state, 0, sizeof(*new_pte_state)); 6177 new_pte_state->needs_4k = true; 6178 6179 // TODO: Bug 1676485: Force a specific page size for perf testing 6180 6181 if (gpu_state->force_4k_ptes) 6182 return; 6183 6184 // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(), 6185 // hmm_range_fault(), and make_device_exclusive_range() don't handle folios 6186 // yet. Also, it makes mremap() difficult since the new address may not 6187 // align with the GPU block size otherwise. 6188 // If PAGE_SIZE is 64K, the code following this check is OK since 64K 6189 // big_pages is supported on all HMM supported GPUs (Turing+). 6190 // TODO: Bug 3368756: add support for transparent huge pages (THP). 6191 if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K) 6192 return; 6193 6194 UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after)); 6195 6196 // If all pages in the 2M mask have the same attributes after the 6197 // operation is applied, we can use a 2M PTE. 6198 if (block_gpu_supports_2m(block, gpu) && 6199 uvm_page_mask_full(page_mask_after) && 6200 (UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) { 6201 new_pte_state->pte_is_2m = true; 6202 new_pte_state->needs_4k = false; 6203 return; 6204 } 6205 6206 // Find big PTEs with matching attributes 6207 6208 // Can this block fit any big pages? 
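    //
    // (As a sketch of what uvm_va_block_big_page_region_all is assumed to
    // return: the interior of the block which is big_page_size-aligned,
    // roughly
    //
    //     from UVM_ALIGN_UP(block->start, big_page_size)
    //     to   UVM_ALIGN_DOWN(block->end + 1, big_page_size)
    //
    // expressed as page indices. An empty region means no big page fits, so
    // everything has to be mapped with 4k PTEs.)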
6209 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 6210 big_region_all = uvm_va_block_big_page_region_all(block, big_page_size); 6211 if (big_region_all.first >= big_region_all.outer) 6212 return; 6213 6214 new_pte_state->needs_4k = false; 6215 6216 can_make_new_big_ptes = true; 6217 6218 // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+). 6219 if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages) 6220 can_make_new_big_ptes = false; 6221 6222 // We must not fail during teardown: unmap (resident_id == UVM_ID_INVALID) 6223 // with no splits required. That means we should avoid allocating PTEs 6224 // which are only needed for merges. 6225 // 6226 // This only matters if we're merging to big PTEs. If we're merging to 2M, 6227 // then we must already have the 2M level (since it has to be allocated 6228 // before the lower levels). 6229 // 6230 // If pte_is_2m already and we don't have a big table, we're splitting so we 6231 // have to allocate. 6232 if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m) 6233 can_make_new_big_ptes = false; 6234 6235 for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) { 6236 uvm_va_block_region_t contig_region = {0}; 6237 6238 big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size); 6239 big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 6240 6241 if (!UVM_ID_IS_INVALID(resident_id)) 6242 contig_region = block_phys_contig_region(block, page_index, resident_id); 6243 6244 __set_bit(big_page_index, new_pte_state->big_ptes_covered); 6245 6246 // When mapping sysmem, we can use big pages only if we are mapping all 6247 // pages in the big page subregion and the CPU pages backing the 6248 // subregion are physically contiguous. 6249 if (can_make_new_big_ptes && 6250 uvm_page_mask_region_full(page_mask_after, big_page_region) && 6251 (!UVM_ID_IS_CPU(resident_id) || 6252 (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) { 6253 __set_bit(big_page_index, new_pte_state->big_ptes); 6254 } 6255 6256 if (!test_bit(big_page_index, new_pte_state->big_ptes)) 6257 new_pte_state->needs_4k = true; 6258 6259 // Skip to the end of the region 6260 page_index = big_page_region.outer - 1; 6261 } 6262 6263 if (!new_pte_state->needs_4k) { 6264 // All big page regions in pages_changing will be big PTEs. Now check if 6265 // there are any unaligned pages outside of big_region_all which are 6266 // changing. 6267 region = uvm_va_block_region(0, big_region_all.first); 6268 if (!uvm_page_mask_region_empty(pages_changing, region)) { 6269 new_pte_state->needs_4k = true; 6270 } 6271 else { 6272 region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block)); 6273 if (!uvm_page_mask_region_empty(pages_changing, region)) 6274 new_pte_state->needs_4k = true; 6275 } 6276 } 6277 6278 // Now add in the PTEs which should be big but weren't covered by this 6279 // operation. 6280 // 6281 // Note that we can't assume that a given page table range has been 6282 // initialized if it's present here, since it could have been allocated by a 6283 // thread which had to restart its operation due to allocation retry. 
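    //
    // Summarized as a sketch (illustrative only), big_ptes_not_covered is
    // taken from one of three sources in the branches below:
    //
    //     splitting a 2M PTE (or 2M level not allocated yet) -> ~big_ptes_covered
    //     no 4k table and !needs_4k                          -> all big page slots
    //     otherwise                                          -> gpu_state->big_ptes & ~big_ptes_covered
    //
    // and is then OR-ed into new_pte_state->big_ptes.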
6284 if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) { 6285 // We're splitting a 2M PTE so all of the uncovered big PTE regions will 6286 // become big PTEs which inherit the 2M permissions. If we haven't 6287 // allocated the 2M table yet, it will start as a 2M PTE until the lower 6288 // levels are allocated, so it's the same split case regardless of 6289 // whether this operation will need to retry a later allocation. 6290 bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6291 } 6292 else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) { 6293 // If we don't have 4k PTEs and we won't be allocating them for this 6294 // operation, all of our PTEs need to be big. 6295 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6296 bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6297 bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size)); 6298 } 6299 else { 6300 // Otherwise, add in all of the currently-big PTEs which are unchanging. 6301 // They won't be written, but they need to be carried into the new 6302 // gpu_state->big_ptes when it's updated. 6303 bitmap_andnot(big_ptes_not_covered, 6304 gpu_state->big_ptes, 6305 new_pte_state->big_ptes_covered, 6306 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6307 } 6308 6309 bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6310 } 6311 6312 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that 6313 // handles allocation retry. If the block lock has been unlocked and relocked as 6314 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal 6315 // to the caller that the operation likely needs to be restarted. If that 6316 // happens, the pending tracker is added to the block's tracker. 6317 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block, 6318 uvm_gpu_t *gpu, 6319 NvU32 page_size, 6320 uvm_page_table_range_t *page_table_range, 6321 uvm_tracker_t *pending_tracker) 6322 { 6323 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 6324 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 6325 uvm_page_tree_t *page_tables = &gpu_va_space->page_tables; 6326 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); 6327 uvm_page_table_range_t local_range; 6328 NV_STATUS status; 6329 6330 // Blocks may contain large PTEs without starting on a PTE boundary or 6331 // having an aligned size. Cover the PTEs of this size in the block's 6332 // interior so we match uvm_va_block_gpu_state_t::big_ptes. 6333 NvU64 start = UVM_ALIGN_UP(va_block->start, page_size); 6334 NvU64 size = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start; 6335 6336 // VA blocks which can use the 2MB level as either a PTE or a PDE need to 6337 // account for the PDE specially, so they must use uvm_page_tree_alloc_table 6338 // to allocate the lower levels. 
6339 bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M; 6340 6341 UVM_ASSERT(page_table_range->table == NULL); 6342 6343 if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) { 6344 --va_block_test->page_table_allocation_retry_force_count; 6345 status = NV_ERR_NO_MEMORY; 6346 } 6347 else if (use_alloc_table) { 6348 // Pascal+: 4k/64k tables under a 2M entry 6349 UVM_ASSERT(gpu_state->page_table_range_2m.table); 6350 status = uvm_page_tree_alloc_table(page_tables, 6351 page_size, 6352 UVM_PMM_ALLOC_FLAGS_NONE, 6353 &gpu_state->page_table_range_2m, 6354 page_table_range); 6355 } 6356 else { 6357 // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+ 6358 status = uvm_page_tree_get_ptes(page_tables, 6359 page_size, 6360 start, 6361 size, 6362 UVM_PMM_ALLOC_FLAGS_NONE, 6363 page_table_range); 6364 } 6365 6366 if (status == NV_OK) 6367 goto allocated; 6368 6369 if (status != NV_ERR_NO_MEMORY) 6370 return status; 6371 6372 // Before unlocking the block lock, any pending work on the block has to be 6373 // added to the block's tracker. 6374 if (pending_tracker) { 6375 status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker); 6376 if (status != NV_OK) 6377 return status; 6378 } 6379 6380 // Unlock the va block and retry with eviction enabled 6381 uvm_mutex_unlock(&va_block->lock); 6382 6383 if (use_alloc_table) { 6384 // Although we don't hold the block lock here, it's safe to pass 6385 // gpu_state->page_table_range_2m to the page tree code because we know 6386 // that the 2m range has already been allocated, and that it can't go 6387 // away while we have the va_space lock held. 6388 status = uvm_page_tree_alloc_table(page_tables, 6389 page_size, 6390 UVM_PMM_ALLOC_FLAGS_EVICT, 6391 &gpu_state->page_table_range_2m, 6392 &local_range); 6393 } 6394 else { 6395 status = uvm_page_tree_get_ptes(page_tables, 6396 page_size, 6397 start, 6398 size, 6399 UVM_PMM_ALLOC_FLAGS_EVICT, 6400 &local_range); 6401 } 6402 6403 uvm_mutex_lock(&va_block->lock); 6404 6405 if (status != NV_OK) 6406 return status; 6407 6408 status = NV_ERR_MORE_PROCESSING_REQUIRED; 6409 6410 if (page_table_range->table) { 6411 // A different caller allocated the page tables in the meantime, release the 6412 // local copy. 6413 uvm_page_tree_put_ptes(page_tables, &local_range); 6414 return status; 6415 } 6416 6417 *page_table_range = local_range; 6418 6419 allocated: 6420 // Mark the 2M PTE as active when we first allocate it, since we don't have 6421 // any PTEs below it yet. 6422 if (page_size == UVM_PAGE_SIZE_2M) { 6423 UVM_ASSERT(!gpu_state->pte_is_2m); 6424 gpu_state->pte_is_2m = true; 6425 } 6426 else if (page_size != UVM_PAGE_SIZE_4K) { 6427 // uvm_page_tree_get_ptes initializes big PTEs to invalid. 6428 // uvm_page_tree_alloc_table does not, so we'll have to do it later. 6429 if (use_alloc_table) 6430 UVM_ASSERT(!gpu_state->initialized_big); 6431 else 6432 gpu_state->initialized_big = true; 6433 } 6434 6435 return status; 6436 } 6437 6438 // Helper which allocates all page table ranges necessary for the given page 6439 // sizes. See block_alloc_pt_range_with_retry. 
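//
// A caller-side sketch of the retry contract (hypothetical caller and tracker,
// for illustration only; the real callers later in this file fold the retry
// into their larger operation):
//
//     uvm_tracker_t tracker = UVM_TRACKER_INIT();
//     NV_STATUS status;
//
//     do {
//         // NV_ERR_MORE_PROCESSING_REQUIRED means the block lock was dropped
//         // and re-taken, so any state derived under the lock must be
//         // recomputed before the operation is retried.
//         status = block_alloc_ptes_with_retry(va_block, gpu, page_sizes, &tracker);
//     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);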
6440 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block, 6441 uvm_gpu_t *gpu, 6442 NvU32 page_sizes, 6443 uvm_tracker_t *pending_tracker) 6444 { 6445 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 6446 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 6447 uvm_page_table_range_t *range; 6448 NvU32 page_size; 6449 NV_STATUS status, final_status = NV_OK; 6450 6451 UVM_ASSERT(gpu_state); 6452 6453 // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first 6454 // in order to allocate the levels below. 6455 if (block_gpu_supports_2m(va_block, gpu)) 6456 page_sizes |= UVM_PAGE_SIZE_2M; 6457 6458 UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes); 6459 6460 for_each_chunk_size_rev(page_size, page_sizes) { 6461 if (page_size == UVM_PAGE_SIZE_2M) 6462 range = &gpu_state->page_table_range_2m; 6463 else if (page_size == UVM_PAGE_SIZE_4K) 6464 range = &gpu_state->page_table_range_4k; 6465 else 6466 range = &gpu_state->page_table_range_big; 6467 6468 if (range->table) 6469 continue; 6470 6471 if (page_size == UVM_PAGE_SIZE_2M) { 6472 UVM_ASSERT(!gpu_state->pte_is_2m); 6473 UVM_ASSERT(!gpu_state->page_table_range_big.table); 6474 UVM_ASSERT(!gpu_state->page_table_range_4k.table); 6475 } 6476 else if (page_size != UVM_PAGE_SIZE_4K) { 6477 UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0); 6478 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6479 } 6480 6481 status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker); 6482 6483 // Keep going to allocate the remaining levels even if the allocation 6484 // requires a retry, since we'll likely still need them when we retry 6485 // anyway. 6486 if (status == NV_ERR_MORE_PROCESSING_REQUIRED) 6487 final_status = NV_ERR_MORE_PROCESSING_REQUIRED; 6488 else if (status != NV_OK) 6489 return status; 6490 } 6491 6492 return final_status; 6493 } 6494 6495 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block, 6496 uvm_gpu_t *gpu, 6497 uvm_va_block_new_pte_state_t *new_pte_state, 6498 uvm_tracker_t *pending_tracker) 6499 { 6500 NvU32 page_sizes = 0; 6501 6502 if (new_pte_state->pte_is_2m) { 6503 page_sizes |= UVM_PAGE_SIZE_2M; 6504 } 6505 else { 6506 if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 6507 page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu); 6508 6509 if (new_pte_state->needs_4k) 6510 page_sizes |= UVM_PAGE_SIZE_4K; 6511 else 6512 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6513 } 6514 6515 return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker); 6516 } 6517 6518 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block. 6519 // This is currently used on ATS systems to prevent GPUs from inadvertently 6520 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables, 6521 // which is where the NOATS bit resides. 6522 // 6523 // The current implementation simply pre-allocates the PTEs for the VA Block, 6524 // which is wasteful because the GPU may never need them. 6525 // 6526 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1 6527 // page table entries without having to request PTEs. 
6528 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block, 6529 uvm_gpu_va_space_t *gpu_va_space, 6530 uvm_tracker_t *pending_tracker) 6531 { 6532 NvU32 page_sizes; 6533 NvU32 big_page_size; 6534 uvm_gpu_t *gpu; 6535 uvm_va_block_gpu_state_t *gpu_state; 6536 6537 UVM_ASSERT(block); 6538 UVM_ASSERT(gpu_va_space); 6539 UVM_ASSERT(gpu_va_space->ats.enabled); 6540 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 6541 6542 gpu = gpu_va_space->gpu; 6543 big_page_size = gpu_va_space->page_tables.big_page_size; 6544 6545 gpu_state = block_gpu_state_get_alloc(block, gpu); 6546 if (!gpu_state) 6547 return NV_ERR_NO_MEMORY; 6548 6549 // If the VA Block supports 2M pages, allocate the 2M PTE only, as it 6550 // requires less memory 6551 if (block_gpu_supports_2m(block, gpu)) 6552 page_sizes = UVM_PAGE_SIZE_2M; 6553 else if (uvm_va_block_num_big_pages(block, big_page_size) > 0) 6554 page_sizes = big_page_size; 6555 else 6556 page_sizes = UVM_PAGE_SIZE_4K; 6557 6558 return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker); 6559 } 6560 6561 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker) 6562 { 6563 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6564 NV_STATUS status = NV_OK; 6565 6566 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See 6567 // comments in block_pre_populate_pde1_gpu. 6568 if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) { 6569 uvm_gpu_va_space_t *gpu_va_space; 6570 6571 for_each_gpu_va_space(gpu_va_space, va_space) { 6572 // We only care about systems where ATS is supported and the application 6573 // enabled it. 6574 if (!gpu_va_space->ats.enabled) 6575 continue; 6576 6577 status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker); 6578 if (status != NV_OK) 6579 break; 6580 } 6581 } 6582 6583 return status; 6584 } 6585 6586 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block, 6587 uvm_va_block_context_t *block_context, 6588 uvm_gpu_t *gpu, 6589 const uvm_page_mask_t *unmap_page_mask, 6590 uvm_tracker_t *out_tracker) 6591 { 6592 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6593 uvm_pte_bits_gpu_t pte_bit; 6594 uvm_push_t push; 6595 uvm_membar_t tlb_membar; 6596 bool only_local_mappings; 6597 uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask; 6598 NV_STATUS status; 6599 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 6600 bool mask_empty; 6601 6602 // We have to check gpu_state before looking at any VA space state like our 6603 // gpu_va_space, because we could be on the eviction path where we don't 6604 // have a lock on that state. However, since remove_gpu_va_space walks each 6605 // block to unmap the GPU before destroying the gpu_va_space, we're 6606 // guaranteed that if this GPU has page tables, the gpu_va_space can't go 6607 // away while we're holding the block lock. 6608 if (!block_gpu_has_page_tables(block, gpu)) 6609 return NV_OK; 6610 6611 if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])) 6612 return NV_OK; 6613 6614 // block_gpu_compute_new_pte_state needs a mask of pages which will have 6615 // matching attributes after the operation is performed. In the case of 6616 // unmap, those are the pages with unset bits. 
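    //
    // Written as a set identity (for illustration), with "mapped" standing in
    // for gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]:
    //
    //     page_mask_after = ~(mapped & ~pages_to_unmap)
    //
    // which is what the two mask operations below compute in
    // scratch_page_mask.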
6617 uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap); 6618 uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask); 6619 block_gpu_compute_new_pte_state(block, 6620 gpu, 6621 UVM_ID_INVALID, 6622 pages_to_unmap, 6623 &block_context->scratch_page_mask, 6624 new_pte_state); 6625 6626 status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker); 6627 if (status != NV_OK) 6628 return status; 6629 6630 only_local_mappings = !block_has_remote_mapping_gpu(block, &block_context->scratch_page_mask, gpu->id, pages_to_unmap); 6631 tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings); 6632 6633 status = uvm_push_begin_acquire(gpu->channel_manager, 6634 UVM_CHANNEL_TYPE_MEMOPS, 6635 &block->tracker, 6636 &push, 6637 "Unmapping pages in block [0x%llx, 0x%llx)", 6638 block->start, 6639 block->end + 1); 6640 if (status != NV_OK) 6641 return status; 6642 6643 if (new_pte_state->pte_is_2m) { 6644 // We're either unmapping a whole valid 2M PTE, or we're unmapping all 6645 // remaining pages in a split 2M PTE. 6646 block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar); 6647 } 6648 else if (gpu_state->pte_is_2m) { 6649 // The block is currently mapped as a valid 2M PTE and we're unmapping 6650 // some pages within the 2M, so we have to split it into the appropriate 6651 // mix of big and 4k PTEs. 6652 block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); 6653 } 6654 else { 6655 // We're unmapping some pre-existing mix of big and 4K PTEs into some 6656 // other mix of big and 4K PTEs. 6657 block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); 6658 } 6659 6660 uvm_push_end(&push); 6661 6662 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { 6663 uvm_processor_mask_t non_uvm_lite_gpus; 6664 uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block)); 6665 6666 UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id)); 6667 6668 // If the GPU is the only non-UVM-Lite processor with mappings, we can 6669 // safely mark pages as fully unmapped 6670 if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1) 6671 uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap); 6672 } 6673 6674 // Clear block PTE state 6675 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 6676 mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], 6677 &gpu_state->pte_bits[pte_bit], 6678 pages_to_unmap); 6679 if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty) 6680 uvm_processor_mask_clear(&block->mapped, gpu->id); 6681 } 6682 6683 UVM_ASSERT(block_check_mappings(block)); 6684 6685 return uvm_tracker_add_push_safe(out_tracker, &push); 6686 } 6687 6688 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block, 6689 uvm_va_block_context_t *va_block_context, 6690 uvm_processor_id_t id, 6691 uvm_va_block_region_t region, 6692 const uvm_page_mask_t *unmap_page_mask, 6693 uvm_tracker_t *out_tracker) 6694 { 6695 uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask; 6696 6697 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 6698 uvm_assert_mutex_locked(&va_block->lock); 6699 6700 if (UVM_ID_IS_CPU(id)) { 6701 block_unmap_cpu(va_block, region, unmap_page_mask); 6702 return NV_OK; 6703 } 6704 6705 uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask); 6706 6707 return 
block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker); 6708 } 6709 6710 // This function essentially works as a wrapper around vm_insert_page (hence 6711 // the similar function prototype). This is needed since vm_insert_page 6712 // doesn't take permissions as input, but uses vma->vm_page_prot instead. 6713 // Since we may have multiple VA blocks under one VMA which need to map 6714 // with different permissions, we have to manually change vma->vm_page_prot for 6715 // each call to vm_insert_page. Multiple faults under one VMA in separate 6716 // blocks can be serviced concurrently, so the VMA wrapper lock is used 6717 // to protect access to vma->vm_page_prot. 6718 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma, 6719 NvU64 addr, 6720 struct page *page, 6721 uvm_prot_t new_prot) 6722 { 6723 uvm_vma_wrapper_t *vma_wrapper; 6724 unsigned long target_flags; 6725 pgprot_t target_pgprot; 6726 int ret; 6727 6728 UVM_ASSERT(vma); 6729 UVM_ASSERT(vma->vm_private_data); 6730 6731 vma_wrapper = vma->vm_private_data; 6732 target_flags = vma->vm_flags; 6733 6734 if (new_prot == UVM_PROT_READ_ONLY) 6735 target_flags &= ~VM_WRITE; 6736 6737 target_pgprot = vm_get_page_prot(target_flags); 6738 6739 // Take VMA wrapper lock to check vma->vm_page_prot 6740 uvm_down_read(&vma_wrapper->lock); 6741 6742 // Take a write lock if we need to modify the VMA vm_page_prot 6743 // - vma->vm_page_prot creates writable PTEs but new prot is RO 6744 // - vma->vm_page_prot creates read-only PTEs but new_prot is RW 6745 if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) { 6746 uvm_up_read(&vma_wrapper->lock); 6747 uvm_down_write(&vma_wrapper->lock); 6748 6749 vma->vm_page_prot = target_pgprot; 6750 6751 uvm_downgrade_write(&vma_wrapper->lock); 6752 } 6753 6754 ret = vm_insert_page(vma, addr, page); 6755 uvm_up_read(&vma_wrapper->lock); 6756 if (ret) { 6757 UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret); 6758 return errno_to_nv_status(ret); 6759 } 6760 6761 return NV_OK; 6762 } 6763 6764 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block, 6765 struct vm_area_struct *hmm_vma, 6766 uvm_page_index_t page_index) 6767 { 6768 uvm_prot_t logical_prot; 6769 6770 if (uvm_va_block_is_hmm(va_block)) { 6771 NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index); 6772 6773 logical_prot = uvm_hmm_compute_logical_prot(va_block, hmm_vma, addr); 6774 } 6775 else { 6776 uvm_va_range_t *va_range = va_block->va_range; 6777 6778 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 6779 6780 // Zombified VA ranges no longer have a vma, so they have no permissions 6781 if (uvm_va_range_is_managed_zombie(va_range)) { 6782 logical_prot = UVM_PROT_NONE; 6783 } 6784 else { 6785 struct vm_area_struct *vma; 6786 6787 vma = uvm_va_range_vma(va_range); 6788 6789 if (!(vma->vm_flags & VM_READ)) 6790 logical_prot = UVM_PROT_NONE; 6791 else if (!(vma->vm_flags & VM_WRITE)) 6792 logical_prot = UVM_PROT_READ_ONLY; 6793 else 6794 logical_prot = UVM_PROT_READ_WRITE_ATOMIC; 6795 } 6796 } 6797 6798 return logical_prot; 6799 } 6800 6801 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page) 6802 { 6803 struct page *page; 6804 6805 if (UVM_ID_IS_CPU(block_page.processor)) { 6806 page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index); 6807 } 6808 else { 6809 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6810 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor); 6811 size_t 
chunk_offset; 6812 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset); 6813 6814 UVM_ASSERT(gpu->mem_info.numa.enabled); 6815 page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE; 6816 } 6817 6818 UVM_ASSERT(page); 6819 return page; 6820 } 6821 6822 // Creates or upgrades a CPU mapping for the given page, updating the block's 6823 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page 6824 // will be mapped with at least new_prot permissions. 6825 // 6826 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use 6827 // block_unmap_cpu or uvm_va_block_revoke_prot instead. 6828 // 6829 // If the existing mapping is >= new_prot already, this is a no-op. 6830 // 6831 // It is the caller's responsibility to: 6832 // - Revoke mappings from other processors as appropriate so the CPU can map 6833 // with new_prot permissions 6834 // - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference 6835 // and mmap_lock is held in at least read mode) 6836 // - For HMM blocks that vma is valid and safe to use, vma->vm_mm has a 6837 // reference and mmap_lock is held in at least read mode 6838 // - Ensure that the struct page corresponding to the physical memory being 6839 // mapped exists 6840 // - Manage the block's residency bitmap 6841 // - Ensure that the block hasn't been killed (block->va_range is present) 6842 // - Update the pte/mapping tracking state on success 6843 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block, 6844 struct vm_area_struct *hmm_vma, 6845 uvm_processor_id_t resident_id, 6846 uvm_page_index_t page_index, 6847 uvm_prot_t new_prot) 6848 { 6849 uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index); 6850 uvm_va_range_t *va_range = block->va_range; 6851 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6852 struct vm_area_struct *vma; 6853 NV_STATUS status; 6854 NvU64 addr; 6855 struct page *page; 6856 6857 UVM_ASSERT((uvm_va_block_is_hmm(block) && hmm_vma) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 6858 UVM_ASSERT(new_prot != UVM_PROT_NONE); 6859 UVM_ASSERT(new_prot < UVM_PROT_MAX); 6860 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); 6861 6862 uvm_assert_mutex_locked(&block->lock); 6863 if (UVM_ID_IS_CPU(resident_id)) 6864 UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index)); 6865 6866 // For the CPU, write implies atomic 6867 if (new_prot == UVM_PROT_READ_WRITE) 6868 new_prot = UVM_PROT_READ_WRITE_ATOMIC; 6869 6870 // Only upgrades are supported in this function 6871 UVM_ASSERT(curr_prot <= new_prot); 6872 6873 if (new_prot == curr_prot) 6874 return NV_OK; 6875 6876 // Check for existing VMA permissions. They could have been modified after 6877 // the initial mmap by mprotect. 6878 if (new_prot > compute_logical_prot(block, hmm_vma, page_index)) 6879 return NV_ERR_INVALID_ACCESS_TYPE; 6880 6881 if (uvm_va_block_is_hmm(block)) { 6882 // Do not map CPU pages because they belong to the Linux kernel. 6883 return NV_OK; 6884 } 6885 6886 UVM_ASSERT(va_range); 6887 6888 if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) { 6889 // Add the page's range group range to the range group's migrated list. 
6890 uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space, 6891 uvm_va_block_cpu_page_address(block, page_index)); 6892 if (rgr != NULL) { 6893 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock); 6894 if (list_empty(&rgr->range_group_migrated_list_node)) 6895 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges); 6896 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock); 6897 } 6898 } 6899 6900 // It's possible here that current->mm != vma->vm_mm. That can happen for 6901 // example due to access_process_vm (ptrace) or get_user_pages from another 6902 // driver. 6903 // 6904 // In such cases the caller has taken care of ref counting vma->vm_mm for 6905 // us, so we can safely operate on the vma but we can't use 6906 // uvm_va_range_vma_current. 6907 vma = uvm_va_range_vma(va_range); 6908 uvm_assert_mmap_lock_locked(vma->vm_mm); 6909 UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm); 6910 6911 // Add the mapping 6912 addr = uvm_va_block_cpu_page_address(block, page_index); 6913 6914 // This unmap handles upgrades as vm_insert_page returns -EBUSY when 6915 // there's already a mapping present at fault_addr, so we have to unmap 6916 // first anyway when upgrading from RO -> RW. 6917 if (curr_prot != UVM_PROT_NONE) 6918 unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1); 6919 6920 // Don't map the CPU until prior copies and GPU PTE updates finish, 6921 // otherwise we might not stay coherent. 6922 status = uvm_tracker_wait(&block->tracker); 6923 if (status != NV_OK) 6924 return status; 6925 6926 page = block_page_get(block, block_phys_page(resident_id, page_index)); 6927 return uvm_cpu_insert_page(vma, addr, page, new_prot); 6928 } 6929 6930 // Maps the CPU to the given pages which are resident on resident_id. 6931 // map_page_mask is an in/out parameter: the pages which are mapped to 6932 // resident_id are removed from the mask before returning. 6933 // 6934 // Caller must ensure that: 6935 // - Pages in map_page_mask must not be set in the corresponding cpu.pte_bits 6936 // mask for the requested protection. 6937 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block, 6938 uvm_va_block_context_t *block_context, 6939 uvm_processor_id_t resident_id, 6940 uvm_va_block_region_t region, 6941 uvm_page_mask_t *map_page_mask, 6942 uvm_prot_t new_prot, 6943 uvm_tracker_t *out_tracker) 6944 { 6945 NV_STATUS status = NV_OK; 6946 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6947 uvm_page_index_t page_index; 6948 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; 6949 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id); 6950 uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot); 6951 uvm_pte_bits_cpu_t pte_bit; 6952 6953 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); 6954 6955 // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls 6956 // within block_map_cpu_page_to by doing them once here is helpful. 6957 6958 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, 6959 map_page_mask, 6960 &block->cpu.pte_bits[prot_pte_bit])); 6961 6962 // The pages which will actually change are those in the input page mask 6963 // which are resident on the target. 
6964 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) 6965 return NV_OK; 6966 6967 status = block_pre_populate_pde1_all_gpus(block, out_tracker); 6968 if (status != NV_OK) 6969 return status; 6970 6971 block->cpu.ever_mapped = true; 6972 6973 for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) { 6974 status = block_map_cpu_page_to(block, 6975 block_context->hmm.vma, 6976 resident_id, 6977 page_index, 6978 new_prot); 6979 if (status != NV_OK) 6980 break; 6981 6982 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); 6983 } 6984 6985 // If there was some error, shrink the region so that we only update the 6986 // pte/mapping tracking bits for the pages that succeeded 6987 if (status != NV_OK) { 6988 region = uvm_va_block_region(region.first, page_index); 6989 uvm_page_mask_region_clear_outside(pages_to_map, region); 6990 } 6991 6992 // If pages are mapped from a remote residency, notify the remote mapping 6993 // events to tools. We skip event notification if the cause is Invalid. We 6994 // use it to signal that this function is being called from the revocation 6995 // path to avoid reporting duplicate events. 6996 if (UVM_ID_IS_GPU(resident_id) && 6997 va_space->tools.enabled && 6998 block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) { 6999 uvm_va_block_region_t subregion; 7000 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { 7001 uvm_tools_record_map_remote(block, 7002 NULL, 7003 UVM_ID_CPU, 7004 resident_id, 7005 uvm_va_block_region_start(block, subregion), 7006 uvm_va_block_region_size(subregion), 7007 block_context->mapping.cause); 7008 } 7009 } 7010 7011 // Update CPU mapping state 7012 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) 7013 uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map); 7014 7015 uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map); 7016 7017 UVM_ASSERT(block_check_mappings(block)); 7018 7019 // Remove all pages that were newly-mapped from the input mask 7020 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); 7021 7022 return status; 7023 } 7024 7025 // Maps the GPU to the given pages which are resident on resident_id. 7026 // map_page_mask is an in/out parameter: the pages which are mapped 7027 // to resident_id are removed from the mask before returning. 7028 // 7029 // Caller must ensure that: 7030 // - Pages in map_page_mask must not be set in the corresponding pte_bits mask 7031 // for the requested protection on the mapping GPU. 
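//
// As a sketch of the expected caller-side preparation (illustrative only;
// uvm_va_block_map later in this file does the equivalent with its running
// page mask):
//
//     // Drop pages which already have new_prot on this GPU before calling
//     if (!uvm_page_mask_andnot(map_page_mask,
//                               map_page_mask,
//                               &gpu_state->pte_bits[get_gpu_pte_bit_index(new_prot)]))
//         return NV_OK; // nothing left to map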
7032 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block, 7033 uvm_va_block_context_t *block_context, 7034 uvm_gpu_t *gpu, 7035 uvm_processor_id_t resident_id, 7036 uvm_page_mask_t *map_page_mask, 7037 uvm_prot_t new_prot, 7038 uvm_tracker_t *out_tracker) 7039 { 7040 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7041 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7042 uvm_push_t push; 7043 NV_STATUS status; 7044 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; 7045 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id); 7046 uvm_pte_bits_gpu_t pte_bit; 7047 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot); 7048 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 7049 block_pte_op_t pte_op; 7050 7051 UVM_ASSERT(map_page_mask); 7052 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id)); 7053 7054 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) 7055 UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location)); 7056 7057 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, 7058 map_page_mask, 7059 &gpu_state->pte_bits[prot_pte_bit])); 7060 7061 // The pages which will actually change are those in the input page mask 7062 // which are resident on the target. 7063 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) 7064 return NV_OK; 7065 7066 UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map)); 7067 7068 // For PTE merge/split computation, compute all resident pages which will 7069 // have exactly new_prot after performing the mapping. 7070 uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map); 7071 if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) { 7072 uvm_page_mask_andnot(&block_context->scratch_page_mask, 7073 &block_context->scratch_page_mask, 7074 &gpu_state->pte_bits[prot_pte_bit + 1]); 7075 } 7076 uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask); 7077 7078 block_gpu_compute_new_pte_state(va_block, 7079 gpu, 7080 resident_id, 7081 pages_to_map, 7082 &block_context->scratch_page_mask, 7083 new_pte_state); 7084 7085 status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker); 7086 if (status != NV_OK) 7087 return status; 7088 7089 status = uvm_push_begin_acquire(gpu->channel_manager, 7090 UVM_CHANNEL_TYPE_MEMOPS, 7091 &va_block->tracker, 7092 &push, 7093 "Mapping pages in block [0x%llx, 0x%llx) as %s", 7094 va_block->start, 7095 va_block->end + 1, 7096 uvm_prot_string(new_prot)); 7097 if (status != NV_OK) 7098 return status; 7099 7100 pte_op = BLOCK_PTE_OP_MAP; 7101 if (new_pte_state->pte_is_2m) { 7102 // We're either modifying permissions of a pre-existing 2M PTE, or all 7103 // permissions match so we can merge to a new 2M PTE. 7104 block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op); 7105 } 7106 else if (gpu_state->pte_is_2m) { 7107 // Permissions on a subset of the existing 2M PTE are being upgraded, so 7108 // we have to split it into the appropriate mix of big and 4k PTEs. 
7109 block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); 7110 } 7111 else { 7112 // We're upgrading permissions on some pre-existing mix of big and 4K 7113 // PTEs into some other mix of big and 4K PTEs. 7114 block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); 7115 } 7116 7117 // If we are mapping remotely, record the event 7118 if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) { 7119 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block); 7120 7121 UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid); 7122 7123 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { 7124 uvm_tools_record_map_remote(va_block, 7125 &push, 7126 gpu->id, 7127 resident_id, 7128 uvm_va_block_region_start(va_block, subregion), 7129 uvm_va_block_region_size(subregion), 7130 block_context->mapping.cause); 7131 } 7132 } 7133 7134 uvm_push_end(&push); 7135 7136 // Update GPU mapping state 7137 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) 7138 uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map); 7139 7140 uvm_processor_mask_set(&va_block->mapped, gpu->id); 7141 7142 // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages 7143 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) 7144 uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map); 7145 7146 // Remove all pages resident on this processor from the input mask, which 7147 // were newly-mapped. 7148 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); 7149 7150 UVM_ASSERT(block_check_mappings(va_block)); 7151 7152 return uvm_tracker_add_push_safe(out_tracker, &push); 7153 } 7154 7155 static void map_get_allowed_destinations(uvm_va_block_t *block, 7156 uvm_va_block_context_t *va_block_context, 7157 const uvm_va_policy_t *policy, 7158 uvm_processor_id_t id, 7159 uvm_processor_mask_t *allowed_mask) 7160 { 7161 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7162 7163 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) { 7164 // UVM-Lite can only map resident pages on the preferred location 7165 uvm_processor_mask_zero(allowed_mask); 7166 uvm_processor_mask_set(allowed_mask, policy->preferred_location); 7167 } 7168 else if ((uvm_va_policy_is_read_duplicate(policy, va_space) || 7169 (uvm_id_equal(policy->preferred_location, id) && 7170 !is_uvm_fault_force_sysmem_set() && 7171 !uvm_hmm_must_use_sysmem(block, va_block_context))) && 7172 uvm_va_space_processor_has_memory(va_space, id)) { 7173 // When operating under read-duplication we should only map the local 7174 // processor to cause fault-and-duplicate of remote pages. 7175 // 7176 // The same holds when this processor is the preferred location: only 7177 // create local mappings to force remote pages to fault-and-migrate. 
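        //
        // For example (hypothetical residency, for illustration only): under
        // read duplication, if id is a GPU and the pages are currently
        // resident only on the CPU, allowed_mask is reduced to just that GPU,
        // which is not in the resident set, so nothing is mapped here and a
        // later access from that GPU faults and duplicates the pages locally.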
7178 uvm_processor_mask_zero(allowed_mask); 7179 uvm_processor_mask_set(allowed_mask, id); 7180 } 7181 else { 7182 // Common case: Just map wherever the memory happens to reside 7183 uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]); 7184 return; 7185 } 7186 7187 // Clamp to resident and accessible processors 7188 uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident); 7189 uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]); 7190 } 7191 7192 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block, 7193 uvm_va_block_context_t *va_block_context, 7194 uvm_processor_id_t id, 7195 uvm_va_block_region_t region, 7196 const uvm_page_mask_t *map_page_mask, 7197 uvm_prot_t new_prot, 7198 UvmEventMapRemoteCause cause, 7199 uvm_tracker_t *out_tracker) 7200 { 7201 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7202 uvm_gpu_t *gpu = NULL; 7203 uvm_processor_mask_t allowed_destinations; 7204 uvm_processor_id_t resident_id; 7205 const uvm_page_mask_t *pte_mask; 7206 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask; 7207 NV_STATUS status; 7208 const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region); 7209 7210 va_block_context->mapping.cause = cause; 7211 7212 UVM_ASSERT(new_prot != UVM_PROT_NONE); 7213 UVM_ASSERT(new_prot < UVM_PROT_MAX); 7214 uvm_assert_mutex_locked(&va_block->lock); 7215 7216 // Mapping is not supported on the eviction path that doesn't hold the VA 7217 // space lock. 7218 uvm_assert_rwsem_locked(&va_space->lock); 7219 7220 if (UVM_ID_IS_CPU(id)) { 7221 uvm_pte_bits_cpu_t prot_pte_bit; 7222 7223 // Check if the current thread is allowed to call vm_insert_page 7224 if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm)) 7225 return NV_OK; 7226 7227 prot_pte_bit = get_cpu_pte_bit_index(new_prot); 7228 pte_mask = &va_block->cpu.pte_bits[prot_pte_bit]; 7229 } 7230 else { 7231 uvm_va_block_gpu_state_t *gpu_state; 7232 uvm_pte_bits_gpu_t prot_pte_bit; 7233 7234 gpu = uvm_va_space_get_gpu(va_space, id); 7235 7236 // Although this GPU UUID is registered in the VA space, it might not have a 7237 // GPU VA space registered. 7238 if (!uvm_gpu_va_space_get(va_space, gpu)) 7239 return NV_OK; 7240 7241 gpu_state = block_gpu_state_get_alloc(va_block, gpu); 7242 if (!gpu_state) 7243 return NV_ERR_NO_MEMORY; 7244 7245 prot_pte_bit = get_gpu_pte_bit_index(new_prot); 7246 pte_mask = &gpu_state->pte_bits[prot_pte_bit]; 7247 } 7248 7249 uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask); 7250 7251 if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask)) 7252 return NV_OK; 7253 7254 // Map per resident location so we can more easily detect physically- 7255 // contiguous mappings. 
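    //
    // For example (hypothetical residency split, for illustration only): if
    // some of the requested pages are resident in this GPU's vidmem and the
    // rest in sysmem, the loop below makes one block_map_gpu_to()/
    // block_map_cpu_to() call per residency, so each pass sees a single
    // physical address space and block_gpu_compute_new_pte_state() can detect
    // physically-contiguous big page runs within it.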
7256 map_get_allowed_destinations(va_block, va_block_context, policy, id, &allowed_destinations); 7257 7258 for_each_closest_id(resident_id, &allowed_destinations, id, va_space) { 7259 if (UVM_ID_IS_CPU(id)) { 7260 status = block_map_cpu_to(va_block, 7261 va_block_context, 7262 resident_id, 7263 region, 7264 running_page_mask, 7265 new_prot, 7266 out_tracker); 7267 } 7268 else { 7269 status = block_map_gpu_to(va_block, 7270 va_block_context, 7271 gpu, 7272 resident_id, 7273 running_page_mask, 7274 new_prot, 7275 out_tracker); 7276 } 7277 7278 if (status != NV_OK) 7279 return status; 7280 7281 // If we've mapped all requested pages, we're done 7282 if (uvm_page_mask_region_empty(running_page_mask, region)) 7283 break; 7284 } 7285 7286 return NV_OK; 7287 } 7288 7289 // Revokes the given pages mapped by cpu. This is implemented by unmapping all 7290 // pages and mapping them later with the lower permission. This is required 7291 // because vm_insert_page can only be used for upgrades from Invalid. 7292 // 7293 // Caller must ensure that: 7294 // - Pages in revoke_page_mask must be set in the 7295 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask. 7296 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block, 7297 uvm_va_block_context_t *block_context, 7298 uvm_va_block_region_t region, 7299 const uvm_page_mask_t *revoke_page_mask, 7300 uvm_tracker_t *out_tracker) 7301 { 7302 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7303 uvm_va_block_region_t subregion; 7304 7305 UVM_ASSERT(revoke_page_mask); 7306 7307 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 7308 7309 block_unmap_cpu(block, region, revoke_page_mask); 7310 7311 // Coalesce revocation event notification 7312 for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) { 7313 uvm_perf_event_notify_revocation(&va_space->perf_events, 7314 block, 7315 UVM_ID_CPU, 7316 uvm_va_block_region_start(block, subregion), 7317 uvm_va_block_region_size(subregion), 7318 UVM_PROT_READ_WRITE_ATOMIC, 7319 UVM_PROT_READ_ONLY); 7320 } 7321 7322 // uvm_va_block_map will skip this remap if we aren't holding the right mm 7323 // lock. 
7324 return uvm_va_block_map(block, 7325 block_context, 7326 UVM_ID_CPU, 7327 region, 7328 revoke_page_mask, 7329 UVM_PROT_READ_ONLY, 7330 UvmEventMapRemoteCauseInvalid, 7331 out_tracker); 7332 } 7333 7334 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block, 7335 uvm_va_block_context_t *block_context, 7336 uvm_gpu_t *gpu, 7337 uvm_prot_t prot_revoked, 7338 const uvm_page_mask_t *pages_revoked) 7339 { 7340 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7341 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 7342 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block); 7343 uvm_pte_bits_gpu_t pte_bit; 7344 7345 for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) { 7346 uvm_prot_t old_prot; 7347 7348 if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked)) 7349 continue; 7350 7351 if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC) 7352 old_prot = UVM_PROT_READ_WRITE_ATOMIC; 7353 else 7354 old_prot = UVM_PROT_READ_WRITE; 7355 7356 for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) { 7357 uvm_perf_event_notify_revocation(&va_space->perf_events, 7358 block, 7359 gpu->id, 7360 uvm_va_block_region_start(block, subregion), 7361 uvm_va_block_region_size(subregion), 7362 old_prot, 7363 prot_revoked - 1); 7364 } 7365 } 7366 } 7367 7368 // Revokes the given pages mapped by gpu which are resident on resident_id. 7369 // revoke_page_mask is an in/out parameter: the pages which have the appropriate 7370 // permissions and are mapped to resident_id are removed from the mask before 7371 // returning. 7372 // 7373 // Caller must ensure that: 7374 // - Pages in map_page_mask must be set in the corresponding pte_bits mask for 7375 // the protection to be revoked on the mapping GPU. 7376 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block, 7377 uvm_va_block_context_t *block_context, 7378 uvm_gpu_t *gpu, 7379 uvm_processor_id_t resident_id, 7380 uvm_page_mask_t *revoke_page_mask, 7381 uvm_prot_t prot_to_revoke, 7382 uvm_tracker_t *out_tracker) 7383 { 7384 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7385 uvm_push_t push; 7386 NV_STATUS status; 7387 uvm_pte_bits_gpu_t pte_bit; 7388 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke); 7389 uvm_prot_t new_prot = prot_to_revoke - 1; 7390 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 7391 block_pte_op_t pte_op; 7392 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id); 7393 uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask; 7394 7395 UVM_ASSERT(revoke_page_mask); 7396 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit])); 7397 7398 // The pages which will actually change are those in the input page mask 7399 // which are resident on the target. 7400 if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask)) 7401 return NV_OK; 7402 7403 UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke)); 7404 7405 // For PTE merge/split computation, compute all resident pages which will 7406 // have exactly prot_to_revoke-1 after performing the revocation. 
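    // Step by step: take the pages currently mapped at prot_to_revoke that are
    // not part of this revocation, subtract those from the pages mapped at
    // prot_to_revoke - 1 (which, since the pte_bits masks are cumulative, also
    // covers higher protections), and clamp the result to pages resident on
    // resident_id. What remains is every resident page that will hold exactly
    // prot_to_revoke - 1 once the revocation completes, which is what the PTE
    // merge/split computation needs.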
7407     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
7408     uvm_page_mask_andnot(&block_context->scratch_page_mask,
7409                          &gpu_state->pte_bits[prot_pte_bit - 1],
7410                          &block_context->scratch_page_mask);
7411     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7412
7413     block_gpu_compute_new_pte_state(va_block,
7414                                     gpu,
7415                                     resident_id,
7416                                     pages_to_revoke,
7417                                     &block_context->scratch_page_mask,
7418                                     new_pte_state);
7419
7420     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7421     if (status != NV_OK)
7422         return status;
7423
7424     status = uvm_push_begin_acquire(gpu->channel_manager,
7425                                     UVM_CHANNEL_TYPE_MEMOPS,
7426                                     &va_block->tracker,
7427                                     &push,
7428                                     "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
7429                                     uvm_prot_string(prot_to_revoke),
7430                                     va_block->start,
7431                                     va_block->end + 1);
7432     if (status != NV_OK)
7433         return status;
7434
7435     pte_op = BLOCK_PTE_OP_REVOKE;
7436     if (new_pte_state->pte_is_2m) {
7437         // We're either modifying permissions of a pre-existing 2M PTE, or all
7438         // permissions match so we can merge to a new 2M PTE.
7439         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7440     }
7441     else if (gpu_state->pte_is_2m) {
7442         // Permissions on a subset of the existing 2M PTE are being downgraded,
7443         // so we have to split it into the appropriate mix of big and 4k PTEs.
7444         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7445     }
7446     else {
7447         // We're downgrading permissions on some pre-existing mix of big and 4K
7448         // PTEs into some other mix of big and 4K PTEs.
7449         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7450     }
7451
7452     uvm_push_end(&push);
7453
7454     block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
7455
7456     // Update GPU mapping state
7457     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
7458         uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
7459
7460     // Remove all pages resident on this processor from the input mask: these
7461     // are exactly the pages whose permissions were revoked by this call, so
7462     // the caller does not process them again.
7463 uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke); 7464 7465 UVM_ASSERT(block_check_mappings(va_block)); 7466 7467 return uvm_tracker_add_push_safe(out_tracker, &push); 7468 } 7469 7470 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block, 7471 uvm_va_block_context_t *va_block_context, 7472 uvm_processor_id_t id, 7473 uvm_va_block_region_t region, 7474 const uvm_page_mask_t *revoke_page_mask, 7475 uvm_prot_t prot_to_revoke, 7476 uvm_tracker_t *out_tracker) 7477 { 7478 uvm_gpu_t *gpu; 7479 uvm_va_block_gpu_state_t *gpu_state; 7480 uvm_processor_mask_t resident_procs; 7481 uvm_processor_id_t resident_id; 7482 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask; 7483 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7484 uvm_pte_bits_gpu_t prot_pte_bit; 7485 7486 UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY); 7487 UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX); 7488 uvm_assert_mutex_locked(&va_block->lock); 7489 7490 if (UVM_ID_IS_CPU(id)) { 7491 if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC) 7492 return NV_OK; 7493 7494 if (uvm_va_block_is_hmm(va_block)) { 7495 // Linux is responsible for CPU page table updates. 7496 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region); 7497 return NV_OK; 7498 } 7499 7500 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); 7501 7502 if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])) 7503 return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker); 7504 7505 return NV_OK; 7506 } 7507 7508 gpu = uvm_va_space_get_gpu(va_space, id); 7509 7510 // UVM-Lite GPUs should never have access revoked 7511 UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id), 7512 "GPU %s\n", uvm_gpu_name(gpu)); 7513 7514 // Return early if there are no mappings for the GPU present in the block 7515 if (!uvm_processor_mask_test(&va_block->mapped, gpu->id)) 7516 return NV_OK; 7517 7518 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7519 prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke); 7520 7521 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); 7522 7523 if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit])) 7524 return NV_OK; 7525 7526 // Revoke per resident location so we can more easily detect physically- 7527 // contiguous mappings. 
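    // Walk the resident processors from closest to farthest from the GPU.
    // block_revoke_prot_gpu_to() removes the pages it handles from
    // running_page_mask, so the loop can stop early once every requested page
    // in the region has been processed.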
7528 uvm_processor_mask_copy(&resident_procs, &va_block->resident); 7529 7530 for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) { 7531 NV_STATUS status = block_revoke_prot_gpu_to(va_block, 7532 va_block_context, 7533 gpu, 7534 resident_id, 7535 running_page_mask, 7536 prot_to_revoke, 7537 out_tracker); 7538 if (status != NV_OK) 7539 return status; 7540 7541 // If we've revoked all requested pages, we're done 7542 if (uvm_page_mask_region_empty(running_page_mask, region)) 7543 break; 7544 } 7545 7546 return NV_OK; 7547 } 7548 7549 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block, 7550 uvm_va_block_context_t *va_block_context, 7551 const uvm_processor_mask_t *map_processor_mask, 7552 uvm_va_block_region_t region, 7553 const uvm_page_mask_t *map_page_mask, 7554 uvm_prot_t new_prot, 7555 UvmEventMapRemoteCause cause) 7556 { 7557 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7558 NV_STATUS status = NV_OK; 7559 NV_STATUS tracker_status; 7560 uvm_processor_id_t id; 7561 7562 for_each_id_in_mask(id, map_processor_mask) { 7563 status = uvm_va_block_map(va_block, 7564 va_block_context, 7565 id, 7566 region, 7567 map_page_mask, 7568 new_prot, 7569 cause, 7570 &local_tracker); 7571 if (status != NV_OK) 7572 break; 7573 } 7574 7575 // Regardless of error, add the successfully-pushed mapping operations into 7576 // the block's tracker. Note that we can't overwrite the tracker because we 7577 // aren't guaranteed that the map actually pushed anything (in which case it 7578 // would've acquired the block tracker first). 7579 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7580 uvm_tracker_deinit(&local_tracker); 7581 7582 return status == NV_OK ? tracker_status : status; 7583 } 7584 7585 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block, 7586 uvm_va_block_context_t *va_block_context, 7587 const uvm_processor_mask_t *unmap_processor_mask, 7588 uvm_va_block_region_t region, 7589 const uvm_page_mask_t *unmap_page_mask) 7590 { 7591 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7592 NV_STATUS status = NV_OK; 7593 NV_STATUS tracker_status; 7594 uvm_processor_id_t id; 7595 7596 // Watch out, unmap_mask could change during iteration since it could be 7597 // va_block->mapped. 7598 for_each_id_in_mask(id, unmap_processor_mask) { 7599 // Errors could either be a system-fatal error (ECC) or an allocation 7600 // retry due to PTE splitting. In either case we should stop after 7601 // hitting the first one. 7602 status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker); 7603 if (status != NV_OK) 7604 break; 7605 } 7606 7607 // See the comment in uvm_va_block_map_mask for adding to the tracker. 7608 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7609 uvm_tracker_deinit(&local_tracker); 7610 7611 return status == NV_OK ? 
tracker_status : status; 7612 } 7613 7614 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block, 7615 uvm_va_block_context_t *va_block_context, 7616 const uvm_processor_mask_t *revoke_processor_mask, 7617 uvm_va_block_region_t region, 7618 const uvm_page_mask_t *revoke_page_mask, 7619 uvm_prot_t prot_to_revoke) 7620 { 7621 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7622 NV_STATUS status = NV_OK; 7623 NV_STATUS tracker_status; 7624 uvm_processor_id_t id; 7625 7626 for_each_id_in_mask(id, revoke_processor_mask) { 7627 status = uvm_va_block_revoke_prot(va_block, 7628 va_block_context, 7629 id, 7630 region, 7631 revoke_page_mask, 7632 prot_to_revoke, 7633 &local_tracker); 7634 if (status != NV_OK) 7635 break; 7636 } 7637 7638 // See the comment in uvm_va_block_map_mask for adding to the tracker. 7639 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7640 uvm_tracker_deinit(&local_tracker); 7641 7642 return status == NV_OK ? tracker_status : status; 7643 } 7644 7645 // Updates the read_duplicated_pages mask in the block when the state of GPU id 7646 // is being destroyed 7647 static void update_read_duplicated_pages_mask(uvm_va_block_t *block, 7648 uvm_gpu_id_t id, 7649 uvm_va_block_gpu_state_t *gpu_state) 7650 { 7651 uvm_gpu_id_t running_id; 7652 bool first = true; 7653 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7654 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7655 uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask; 7656 uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask; 7657 7658 uvm_page_mask_zero(&block->read_duplicated_pages); 7659 7660 for_each_id_in_mask(running_id, &block->resident) { 7661 const uvm_page_mask_t *running_residency_mask; 7662 7663 if (uvm_id_equal(running_id, id)) 7664 continue; 7665 7666 running_residency_mask = uvm_va_block_resident_mask_get(block, running_id); 7667 7668 if (first) { 7669 uvm_page_mask_copy(running_page_mask, running_residency_mask); 7670 first = false; 7671 continue; 7672 } 7673 7674 if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask)) 7675 uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask); 7676 7677 uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask); 7678 } 7679 } 7680 7681 // Unmaps all GPU mappings under this block, frees the page tables, and frees 7682 // all the GPU chunks. This simply drops the chunks on the floor, so the caller 7683 // must take care of copying the data elsewhere if it needs to remain intact. 7684 // 7685 // This serializes on the block tracker since it must unmap page tables. 
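// Teardown proceeds roughly in this order: tear down the GPU's PTEs and page
// tables via uvm_va_block_remove_gpu_va_space(), remove indirect-peer reverse
// mappings, release any vidmem chunks back to PMM, wait on the block tracker,
// and finally unmap the DMA mappings of the block's CPU pages before freeing
// the per-GPU state.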
7686 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id) 7687 { 7688 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 7689 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7690 uvm_gpu_va_space_t *gpu_va_space; 7691 uvm_gpu_t *gpu, *other_gpu; 7692 7693 if (!gpu_state) 7694 return; 7695 7696 uvm_assert_mutex_locked(&block->lock); 7697 7698 // Unmap PTEs and free page tables 7699 gpu = uvm_va_space_get_gpu(va_space, id); 7700 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); 7701 if (gpu_va_space) { 7702 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7703 7704 uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context); 7705 } 7706 7707 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); 7708 7709 // No processor should have this GPU mapped at this point 7710 UVM_ASSERT(block_check_processor_not_mapped(block, id)); 7711 7712 // We need to remove the mappings of the indirect peers from the reverse 7713 // map when the GPU state is being destroyed (for example, on 7714 // unregister_gpu) and when peer access between indirect peers is disabled. 7715 // However, we need to avoid double mapping removals. There are two 7716 // possible scenarios: 7717 // - Disable peer access first. This will remove all mappings between A and 7718 // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to 7719 // unregister_gpu will not operate on that pair of GPUs. 7720 // - Unregister GPU first. This will remove all mappings from all indirect 7721 // peers to the GPU being unregistered. It will also destroy its GPU state. 7722 // Subsequent calls to disable peers will remove the mappings from the GPU 7723 // being unregistered, but never to the GPU being unregistered (since it no 7724 // longer has a valid GPU state). 7725 for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) 7726 block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu); 7727 7728 if (gpu_state->chunks) { 7729 size_t i, num_chunks; 7730 7731 update_read_duplicated_pages_mask(block, id, gpu_state); 7732 uvm_page_mask_zero(&gpu_state->resident); 7733 block_clear_resident_processor(block, id); 7734 7735 num_chunks = block_num_gpu_chunks(block, gpu); 7736 for (i = 0; i < num_chunks; i++) { 7737 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 7738 if (!chunk) 7739 continue; 7740 7741 uvm_mmu_chunk_unmap(chunk, &block->tracker); 7742 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); 7743 } 7744 7745 uvm_kvfree(gpu_state->chunks); 7746 } 7747 else { 7748 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 7749 } 7750 7751 7752 // Pending operations may still need the DMA memory to be mapped. 
7753 uvm_tracker_wait(&block->tracker); 7754 7755 block_gpu_unmap_phys_all_cpu_pages(block, gpu); 7756 uvm_processor_mask_clear(&block->evicted_gpus, id); 7757 7758 kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state); 7759 block->gpus[uvm_id_gpu_index(id)] = NULL; 7760 } 7761 7762 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range) 7763 { 7764 if (range->table) { 7765 uvm_page_tree_put_ptes(tree, range); 7766 memset(range, 0, sizeof(*range)); 7767 } 7768 } 7769 7770 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space) 7771 { 7772 uvm_assert_mutex_locked(&va_block->lock); 7773 7774 if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped) 7775 return NV_OK; 7776 7777 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See 7778 // comments in pre_populate_pde1_gpu. 7779 return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL); 7780 } 7781 7782 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, 7783 uvm_gpu_va_space_t *gpu_va_space, 7784 uvm_va_block_context_t *block_context) 7785 { 7786 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 7787 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 7788 uvm_gpu_t *gpu = gpu_va_space->gpu; 7789 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7790 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); 7791 uvm_push_t push; 7792 NV_STATUS status; 7793 7794 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7795 7796 if (!gpu_state) 7797 return; 7798 7799 uvm_assert_mutex_locked(&va_block->lock); 7800 7801 // Unmapping the whole block won't cause a page table split, so this should 7802 // only fail if we have a system-fatal error. 7803 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker); 7804 if (status != NV_OK) { 7805 UVM_ASSERT(status == uvm_global_get_status()); 7806 return; // Just leak 7807 } 7808 7809 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); 7810 7811 // Reset the page tables if other allocations could reuse them 7812 if (!block_gpu_supports_2m(va_block, gpu) && 7813 !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 7814 7815 status = uvm_push_begin_acquire(gpu->channel_manager, 7816 UVM_CHANNEL_TYPE_MEMOPS, 7817 &local_tracker, 7818 &push, 7819 "Resetting PTEs for block [0x%llx, 0x%llx)", 7820 va_block->start, 7821 va_block->end + 1); 7822 if (status != NV_OK) { 7823 UVM_ASSERT(status == uvm_global_get_status()); 7824 return; // Just leak 7825 } 7826 7827 uvm_pte_batch_begin(&push, pte_batch); 7828 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 7829 7830 // When the big PTEs is active, the 4k PTEs under it are garbage. Make 7831 // them invalid so the page tree code can reuse them for other 7832 // allocations on this VA. These don't need TLB invalidates since the 7833 // big PTEs above them are active. 7834 if (gpu_state->page_table_range_4k.table) { 7835 uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes); 7836 block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 7837 } 7838 7839 // We unmapped all big PTEs above, which means they have the unmapped 7840 // pattern so the GPU MMU won't read 4k PTEs under them. Set them to 7841 // invalid to activate the 4ks below so new allocations using just those 7842 // 4k PTEs will work. 
7843 block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch); 7844 7845 uvm_pte_batch_end(pte_batch); 7846 uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE); 7847 7848 uvm_push_end(&push); 7849 uvm_tracker_overwrite_with_push(&local_tracker, &push); 7850 } 7851 7852 // The unmap must finish before we free the page tables 7853 status = uvm_tracker_wait_deinit(&local_tracker); 7854 if (status != NV_OK) 7855 return; // System-fatal error, just leak 7856 7857 // Note that if the PTE is currently 2M with lower tables allocated but not 7858 // in use, calling put_ptes on those lower ranges will re-write the 2M entry 7859 // to be a PDE. 7860 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k); 7861 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big); 7862 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m); 7863 7864 gpu_state->pte_is_2m = false; 7865 gpu_state->initialized_big = false; 7866 gpu_state->activated_big = false; 7867 gpu_state->activated_4k = false; 7868 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 7869 7870 UVM_ASSERT(block_check_mappings(va_block)); 7871 } 7872 7873 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 7874 { 7875 NV_STATUS status; 7876 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7877 7878 UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID); 7879 uvm_assert_rwsem_locked_write(&va_space->lock); 7880 uvm_assert_mutex_locked(&va_block->lock); 7881 7882 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { 7883 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7884 if (status != NV_OK) 7885 return status; 7886 7887 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0); 7888 if (status != NV_OK) { 7889 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7890 return status; 7891 } 7892 } 7893 7894 // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we 7895 // call it here. 7896 7897 return NV_OK; 7898 } 7899 7900 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 7901 { 7902 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7903 NV_STATUS status; 7904 uvm_tracker_t tracker = UVM_TRACKER_INIT(); 7905 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7906 uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask; 7907 const uvm_page_mask_t *resident0; 7908 const uvm_page_mask_t *resident1; 7909 7910 uvm_assert_mutex_locked(&va_block->lock); 7911 7912 // See comment in block_destroy_gpu_state 7913 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { 7914 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7915 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0); 7916 } 7917 7918 // If either of the GPUs doesn't have GPU state then nothing could be mapped 7919 // between them. 
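    // Beyond that early check, note that a peer mapping on one GPU is expected
    // to target only pages resident on the other GPU but not locally (pages
    // resident on both would be mapped to the local copy), so the andnot masks
    // below should cover every peer mapping between the two GPUs.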
7920     if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
7921         return;
7922
7923     resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id);
7924     resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id);
7925
7926     // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
7927     if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
7928         status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
7929         if (status != NV_OK) {
7930             // Since all PTEs unmapped by this call have the same aperture, page
7931             // splits should never be required so any failure should be the
7932             // result of a system-fatal error.
7933             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7934                            "Unmapping failed: %s, GPU %s\n",
7935                            nvstatusToString(status),
7936                            uvm_gpu_name(gpu0));
7937         }
7938     }
7939
7940     // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
7941     if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
7942         status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
7943         if (status != NV_OK) {
7944             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7945                            "Unmapping failed: %s, GPU %s\n",
7946                            nvstatusToString(status),
7947                            uvm_gpu_name(gpu1));
7948         }
7949     }
7950
7951     status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
7952     if (status != NV_OK)
7953         UVM_ASSERT(status == uvm_global_get_status());
7954
7955     status = uvm_tracker_wait_deinit(&tracker);
7956     if (status != NV_OK)
7957         UVM_ASSERT(status == uvm_global_get_status());
7958 }
7959
7960 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
7961 {
7962     NV_STATUS status;
7963     uvm_va_range_t *va_range = va_block->va_range;
7964     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7965     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7966     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7967
7968     uvm_assert_mutex_locked(&va_block->lock);
7969     UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
7970
7971     // If the GPU doesn't have GPU state then nothing could be mapped.
7972     if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
7973         return;
7974
7975     // In UVM-Lite mode, mappings to the preferred location are not tracked
7976     // directly, so just unmap the whole block.
7977     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
7978     if (status != NV_OK) {
7979         // Unmapping the whole block should not cause page splits so any failure
7980         // should be the result of a system-fatal error.
7981         UVM_ASSERT_MSG(status == uvm_global_get_status(),
7982                        "Unmapping failed: %s, GPU %s\n",
7983                        nvstatusToString(status), uvm_gpu_name(gpu));
7984     }
7985
7986     status = uvm_tracker_wait(&va_block->tracker);
7987     if (status != NV_OK) {
7988         UVM_ASSERT_MSG(status == uvm_global_get_status(),
7989                        "Unmapping failed: %s, GPU %s\n",
7990                        nvstatusToString(status), uvm_gpu_name(gpu));
7991     }
7992 }
7993
7994 // Evict pages from the GPU by moving each resident region to the CPU
7995 //
7996 // Notably the caller needs to support allocation-retry as
7997 // uvm_va_block_migrate_locked() requires that.
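// For example, uvm_va_block_unregister_gpu_locked() below provides that retry
// support by invoking this function as:
//
//     UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm));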
7998 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 7999 { 8000 NV_STATUS status = NV_OK; 8001 const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id); 8002 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); 8003 uvm_va_block_region_t subregion; 8004 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 8005 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm); 8006 8007 // Move all subregions resident on the GPU to the CPU 8008 for_each_va_block_subregion_in_mask(subregion, resident, region) { 8009 if (uvm_va_block_is_hmm(va_block)) { 8010 status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, 8011 gpu, 8012 block_context, 8013 resident, 8014 subregion); 8015 } 8016 else { 8017 status = uvm_va_block_migrate_locked(va_block, 8018 NULL, 8019 block_context, 8020 subregion, 8021 UVM_ID_CPU, 8022 UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, 8023 NULL); 8024 } 8025 if (status != NV_OK) 8026 return status; 8027 } 8028 8029 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id)); 8030 return NV_OK; 8031 } 8032 8033 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8034 { 8035 NV_STATUS status; 8036 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 8037 8038 uvm_assert_mutex_locked(&va_block->lock); 8039 8040 if (!gpu_state) 8041 return; 8042 8043 // The mappings should've already been torn down by GPU VA space unregister 8044 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); 8045 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); 8046 UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu)); 8047 8048 // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and 8049 // we don't rely on any state of the block across the call. 8050 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm)); 8051 if (status != NV_OK) { 8052 UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n", 8053 nvstatusToString(status), 8054 uvm_gpu_name(gpu)); 8055 uvm_global_set_fatal_error(status); 8056 } 8057 8058 // This function will copy the block's tracker into each chunk then free the 8059 // chunk to PMM. If we do this before waiting for the block tracker below 8060 // we'll populate PMM's free chunks with tracker entries, which gives us 8061 // better testing coverage of chunk synchronization on GPU unregister. 8062 block_destroy_gpu_state(va_block, gpu->id); 8063 8064 // Any time a GPU is unregistered we need to make sure that there are no 8065 // pending (direct or indirect) tracker entries for that GPU left in the 8066 // block's tracker. The only way to ensure that is to wait for the whole 8067 // tracker. 8068 status = uvm_tracker_wait(&va_block->tracker); 8069 if (status != NV_OK) 8070 UVM_ASSERT(status == uvm_global_get_status()); 8071 } 8072 8073 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8074 { 8075 // Take the lock internally to not expose the caller to allocation-retry. 
8076 uvm_mutex_lock(&va_block->lock); 8077 8078 uvm_va_block_unregister_gpu_locked(va_block, gpu, mm); 8079 8080 uvm_mutex_unlock(&va_block->lock); 8081 } 8082 8083 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region) 8084 { 8085 uvm_page_index_t page_index; 8086 8087 uvm_assert_mutex_locked(&va_block->lock); 8088 8089 for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region) 8090 block_mark_cpu_page_dirty(va_block, page_index); 8091 } 8092 8093 // Tears down everything within the block, but doesn't free the block itself. 8094 // Note that when uvm_va_block_kill is called, this is called twice: once for 8095 // the initial kill itself, then again when the block's ref count is eventually 8096 // destroyed. block->va_range is used to track whether the block has already 8097 // been killed. 8098 static void block_kill(uvm_va_block_t *block) 8099 { 8100 uvm_va_space_t *va_space; 8101 uvm_perf_event_data_t event_data; 8102 uvm_cpu_chunk_t *chunk; 8103 uvm_gpu_id_t id; 8104 NV_STATUS status; 8105 uvm_va_block_region_t region = uvm_va_block_region_from_block(block); 8106 uvm_page_index_t page_index; 8107 uvm_page_index_t next_page_index; 8108 8109 if (uvm_va_block_is_dead(block)) 8110 return; 8111 8112 va_space = uvm_va_block_get_va_space(block); 8113 event_data.block_destroy.block = block; 8114 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data); 8115 8116 // Unmap all processors in parallel first. Unmapping the whole block won't 8117 // cause a page table split, so this should only fail if we have a system- 8118 // fatal error. 8119 if (!uvm_processor_mask_empty(&block->mapped)) { 8120 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 8121 8122 // HMM CPU mappings are controlled by Linux so no need to unmap. 8123 // Remote GPU mappings will be removed below. 8124 if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) { 8125 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]); 8126 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]); 8127 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 8128 } 8129 8130 // We could only be killed with mapped GPU state by VA range free or VA 8131 // space teardown, so it's safe to use the va_space's block_context 8132 // because both of those have the VA space lock held in write mode. 8133 status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL); 8134 UVM_ASSERT(status == uvm_global_get_status()); 8135 } 8136 8137 UVM_ASSERT(uvm_processor_mask_empty(&block->mapped)); 8138 8139 // Free the GPU page tables and chunks 8140 for_each_gpu_id(id) 8141 block_destroy_gpu_state(block, id); 8142 8143 // Wait for the GPU PTE unmaps before freeing CPU memory 8144 uvm_tracker_wait_deinit(&block->tracker); 8145 8146 // No processor should have the CPU mapped at this point 8147 UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU)); 8148 8149 // Free CPU pages 8150 for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) { 8151 // be conservative. 8152 // Tell the OS we wrote to the page because we sometimes clear the dirty 8153 // bit after writing to it. HMM dirty flags are managed by the kernel. 
8154 if (!uvm_va_block_is_hmm(block)) 8155 uvm_cpu_chunk_mark_dirty(chunk, 0); 8156 uvm_cpu_chunk_remove_from_block(block, page_index); 8157 uvm_cpu_chunk_free(chunk); 8158 } 8159 8160 uvm_kvfree((void *)block->cpu.chunks); 8161 block->cpu.chunks = 0; 8162 8163 // Clearing the resident bit isn't strictly necessary since this block 8164 // is getting destroyed, but it keeps state consistent for assertions. 8165 uvm_page_mask_zero(&block->cpu.resident); 8166 block_clear_resident_processor(block, UVM_ID_CPU); 8167 8168 if (uvm_va_block_is_hmm(block)) 8169 uvm_va_policy_clear(block, block->start, block->end); 8170 8171 block->va_range = NULL; 8172 #if UVM_IS_CONFIG_HMM() 8173 block->hmm.va_space = NULL; 8174 #endif 8175 } 8176 8177 // Called when the block's ref count drops to 0 8178 void uvm_va_block_destroy(nv_kref_t *nv_kref) 8179 { 8180 uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref); 8181 8182 // Nobody else should have a reference when freeing 8183 uvm_assert_mutex_unlocked(&block->lock); 8184 8185 uvm_mutex_lock(&block->lock); 8186 block_kill(block); 8187 uvm_mutex_unlock(&block->lock); 8188 8189 if (uvm_enable_builtin_tests) { 8190 uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block); 8191 8192 kmem_cache_free(g_uvm_va_block_cache, block_wrapper); 8193 } 8194 else { 8195 kmem_cache_free(g_uvm_va_block_cache, block); 8196 } 8197 } 8198 8199 void uvm_va_block_kill(uvm_va_block_t *va_block) 8200 { 8201 uvm_mutex_lock(&va_block->lock); 8202 block_kill(va_block); 8203 uvm_mutex_unlock(&va_block->lock); 8204 8205 // May call block_kill again 8206 uvm_va_block_release(va_block); 8207 } 8208 8209 static void block_gpu_release_region(uvm_va_block_t *va_block, 8210 uvm_gpu_id_t gpu_id, 8211 uvm_va_block_gpu_state_t *gpu_state, 8212 uvm_page_mask_t *page_mask, 8213 uvm_va_block_region_t region) 8214 { 8215 uvm_page_index_t page_index; 8216 8217 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 8218 uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index]; 8219 8220 if (!gpu_chunk) 8221 continue; 8222 8223 // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks 8224 8225 uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker); 8226 8227 // The GPU chunk will be freed when the device private reference drops. 8228 if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) && 8229 uvm_page_mask_empty(&gpu_state->resident)) 8230 block_clear_resident_processor(va_block, gpu_id); 8231 8232 gpu_state->chunks[page_index] = NULL; 8233 } 8234 } 8235 8236 void uvm_va_block_munmap_region(uvm_va_block_t *va_block, 8237 uvm_va_block_region_t region) 8238 { 8239 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 8240 uvm_perf_event_data_t event_data; 8241 uvm_gpu_id_t gpu_id; 8242 8243 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 8244 uvm_assert_mutex_locked(&va_block->lock); 8245 8246 // Reset thrashing state for the region. 8247 event_data.block_munmap.block = va_block; 8248 event_data.block_munmap.region = region; 8249 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data); 8250 8251 // Set a flag so that GPU fault events are flushed since they might refer 8252 // to the region being unmapped. 8253 // Note that holding the va_block lock prevents GPU VA spaces from 8254 // being removed so the registered_gpu_va_spaces mask is stable. 
8255 for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) { 8256 uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id); 8257 } 8258 8259 // Release any remaining vidmem chunks in the given region. 8260 for_each_gpu_id(gpu_id) { 8261 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 8262 8263 if (!gpu_state) 8264 continue; 8265 8266 uvm_page_mask_region_clear(&gpu_state->evicted, region); 8267 if (uvm_page_mask_empty(&gpu_state->evicted)) 8268 uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id); 8269 8270 if (gpu_state->chunks) { 8271 block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region); 8272 8273 // TODO: bug 3660922: Need to update the read duplicated pages mask 8274 // when read duplication is supported for HMM. 8275 } 8276 else { 8277 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id)); 8278 } 8279 } 8280 8281 uvm_va_policy_clear(va_block, 8282 uvm_va_block_region_start(va_block, region), 8283 uvm_va_block_region_end(va_block, region)); 8284 } 8285 8286 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 8287 { 8288 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 8289 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 8290 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 8291 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); 8292 NvU32 alloc_sizes; 8293 DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8294 uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8295 size_t big_page_index; 8296 uvm_push_t push; 8297 NV_STATUS status; 8298 8299 // We only have to split to big PTEs if we're currently a 2M PTE 8300 if (existing_gpu_state->pte_is_2m) { 8301 // We can skip the split if the 2M PTE is invalid and we have no lower 8302 // PTEs. 8303 if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE && 8304 !existing_gpu_state->page_table_range_big.table && 8305 !existing_gpu_state->page_table_range_4k.table) 8306 return NV_OK; 8307 8308 alloc_sizes = big_page_size; 8309 bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8310 8311 if (!IS_ALIGNED(new->start, big_page_size)) { 8312 alloc_sizes |= UVM_PAGE_SIZE_4K; 8313 8314 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); 8315 __clear_bit(big_page_index, new_big_ptes); 8316 } 8317 8318 status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL); 8319 if (status != NV_OK) 8320 return status; 8321 8322 status = uvm_push_begin_acquire(gpu->channel_manager, 8323 UVM_CHANNEL_TYPE_MEMOPS, 8324 &existing->tracker, 8325 &push, 8326 "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", 8327 existing->start, existing->end + 1, 8328 new->start, new->end + 1); 8329 if (status != NV_OK) 8330 return status; 8331 8332 block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push); 8333 } 8334 else { 8335 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); 8336 8337 // If the split point is on a big page boundary, or if the split point 8338 // is not currently covered by a big PTE, we don't have to split 8339 // anything. 
8340 if (IS_ALIGNED(new->start, big_page_size) || 8341 big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK || 8342 !test_bit(big_page_index, existing_gpu_state->big_ptes)) 8343 return NV_OK; 8344 8345 status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL); 8346 if (status != NV_OK) 8347 return status; 8348 8349 bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8350 __set_bit(big_page_index, new_big_ptes); 8351 8352 status = uvm_push_begin_acquire(gpu->channel_manager, 8353 UVM_CHANNEL_TYPE_MEMOPS, 8354 &existing->tracker, 8355 &push, 8356 "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", 8357 existing->start, existing->end + 1, 8358 new->start, new->end + 1); 8359 if (status != NV_OK) 8360 return status; 8361 8362 block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push); 8363 } 8364 8365 uvm_push_end(&push); 8366 8367 // Adding this push to existing block tracker will cause all GPU PTE splits 8368 // to serialize on each other, but it's simpler than maintaining a separate 8369 // tracker and this path isn't performance-critical. 8370 return uvm_tracker_add_push_safe(&existing->tracker, &push); 8371 } 8372 8373 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new) 8374 { 8375 uvm_gpu_t *gpu; 8376 uvm_gpu_id_t id; 8377 NV_STATUS status; 8378 8379 for_each_gpu_id(id) { 8380 if (!uvm_va_block_gpu_state_get(existing, id)) 8381 continue; 8382 8383 gpu = block_get_gpu(existing, id); 8384 8385 if (block_gpu_has_page_tables(existing, gpu)) { 8386 status = block_split_presplit_ptes_gpu(existing, new, gpu); 8387 if (status != NV_OK) 8388 return status; 8389 } 8390 } 8391 8392 return NV_OK; 8393 } 8394 8395 typedef struct 8396 { 8397 // Number of chunks contained by this VA block 8398 size_t num_chunks; 8399 8400 // Index of the "interesting" chunk, either adjacent to or spanning the 8401 // split point depending on which block this is. 8402 size_t chunk_index; 8403 8404 // Size of the chunk referenced by chunk_index 8405 uvm_chunk_size_t chunk_size; 8406 } block_gpu_chunk_split_state_t; 8407 8408 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block, 8409 block_gpu_chunk_split_state_t *state, 8410 NvU64 start, 8411 NvU64 end, 8412 uvm_page_index_t page_index, 8413 uvm_gpu_t *gpu) 8414 { 8415 NvU64 size = end - start + 1; 8416 state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu); 8417 state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size); 8418 } 8419 8420 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 8421 { 8422 uvm_gpu_t *accessing_gpu; 8423 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 8424 8425 uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk); 8426 8427 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 8428 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 8429 8430 uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, 8431 peer_addr, 8432 uvm_gpu_chunk_get_size(chunk)); 8433 } 8434 } 8435 8436 // Perform any chunk splitting and array growing required for this block split, 8437 // but don't actually move chunk pointers anywhere. 
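// Example of the split-size selection below, assuming the typical 4K/64K/2M
// user chunk sizes: if the chunk spanning the split point is 2M and the new
// block needs 4K chunks at the split, split_sizes keeps the sizes below 2M and
// at or above 4K, i.e. 64K and 4K. The loop then splits the 2M chunk into 64K
// subchunks, descends into the subchunk covering the split point, and splits
// that one again into 4K chunks.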
8438 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 8439 { 8440 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 8441 uvm_gpu_t *accessing_gpu; 8442 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 8443 uvm_gpu_chunk_t **temp_chunks; 8444 uvm_gpu_chunk_t *original_chunk, *curr_chunk; 8445 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8446 uvm_chunk_sizes_mask_t split_sizes; 8447 uvm_chunk_size_t subchunk_size; 8448 NV_STATUS status; 8449 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; 8450 8451 block_gpu_chunk_get_split_state(existing, 8452 &existing_before_state, 8453 existing->start, 8454 existing->end, 8455 split_page_index, 8456 gpu); 8457 block_gpu_chunk_get_split_state(existing, 8458 &existing_after_state, 8459 existing->start, 8460 new->start - 1, 8461 split_page_index - 1, 8462 gpu); 8463 block_gpu_chunk_get_split_state(new, 8464 &new_state, 8465 new->start, 8466 new->end, 8467 0, 8468 gpu); 8469 8470 // Even though we're splitting existing, we could wind up requiring a larger 8471 // chunks array if we split a large chunk into many smaller ones. 8472 if (existing_after_state.num_chunks > existing_before_state.num_chunks) { 8473 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, 8474 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); 8475 if (!temp_chunks) 8476 return NV_ERR_NO_MEMORY; 8477 existing_gpu_state->chunks = temp_chunks; 8478 } 8479 8480 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; 8481 8482 // If the chunk covering the split point is not populated, we're done. We've 8483 // already grown the array to cover any new chunks which may be populated 8484 // later. 8485 if (!original_chunk) 8486 return NV_OK; 8487 8488 // Figure out the splits we need to perform. Remove all sizes >= the current 8489 // size, and all sizes < the target size. Note that the resulting mask will 8490 // be 0 if the sizes match (we're already splitting at a chunk boundary). 8491 UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size); 8492 UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size); 8493 split_sizes = gpu->parent->mmu_user_chunk_sizes; 8494 split_sizes &= existing_before_state.chunk_size - 1; 8495 split_sizes &= ~(new_state.chunk_size - 1); 8496 8497 // Keep splitting the chunk covering the split point until we hit the target 8498 // size. 8499 curr_chunk = original_chunk; 8500 for_each_chunk_size_rev(subchunk_size, split_sizes) { 8501 size_t last_index, num_subchunks; 8502 8503 status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL); 8504 if (status != NV_OK) 8505 goto error; 8506 8507 // Split physical GPU mappings for indirect peers 8508 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 8509 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu); 8510 8511 status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, 8512 peer_addr, 8513 subchunk_size); 8514 if (status != NV_OK) 8515 goto error; 8516 } 8517 8518 if (subchunk_size == new_state.chunk_size) 8519 break; 8520 8521 // Compute the last subchunk index prior to the split point. 
Divide the 8522 // entire address space into units of subchunk_size, then mod by the 8523 // number of subchunks within the parent. 8524 last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size); 8525 num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size); 8526 UVM_ASSERT(num_subchunks > 1); 8527 last_index &= num_subchunks - 1; 8528 8529 uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk); 8530 UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size); 8531 } 8532 8533 // Note that existing's chunks array still has a pointer to original_chunk, 8534 // not to any newly-split subchunks. If a subsequent split failure occurs on 8535 // a later GPU we'll have to merge it back. Once we're past the preallocate 8536 // stage we'll remove it from the chunks array and move the new split chunks 8537 // in. 8538 8539 return NV_OK; 8540 8541 error: 8542 // On error we need to leave the chunk in its initial state 8543 block_merge_chunk(existing, gpu, original_chunk); 8544 8545 return status; 8546 } 8547 8548 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block) 8549 { 8550 uvm_cpu_chunk_storage_mixed_t *mixed; 8551 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0); 8552 NV_STATUS status; 8553 8554 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 8555 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK); 8556 8557 mixed = uvm_kvmalloc_zero(sizeof(*mixed)); 8558 if (!mixed) 8559 return NV_ERR_NO_MEMORY; 8560 8561 status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots); 8562 if (status != NV_OK) { 8563 uvm_kvfree(mixed); 8564 return status; 8565 } 8566 8567 bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK); 8568 block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 8569 return status; 8570 } 8571 8572 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index) 8573 { 8574 uvm_cpu_chunk_storage_mixed_t *mixed; 8575 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8576 uvm_cpu_chunk_t **small_chunks; 8577 size_t slot_index; 8578 NV_STATUS status; 8579 8580 UVM_ASSERT(chunk); 8581 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 8582 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8583 8584 mixed = uvm_cpu_storage_get_ptr(block); 8585 slot_index = compute_slot_index(block, page_index); 8586 small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 8587 if (!small_chunks) 8588 return NV_ERR_NO_MEMORY; 8589 8590 status = uvm_cpu_chunk_split(chunk, small_chunks); 8591 if (status != NV_OK) { 8592 uvm_kvfree(small_chunks); 8593 return status; 8594 } 8595 8596 mixed->slots[slot_index] = small_chunks; 8597 clear_bit(slot_index, mixed->big_chunks); 8598 return status; 8599 } 8600 8601 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index) 8602 { 8603 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8604 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); 8605 uvm_chunk_size_t new_size; 8606 uvm_gpu_t *gpu; 8607 NvU64 gpu_mapping_addr; 8608 uvm_processor_mask_t gpu_split_mask; 8609 uvm_gpu_id_t id; 8610 NV_STATUS status; 8611 8612 if (chunk_size == UVM_CHUNK_SIZE_2M) 8613 new_size = UVM_CHUNK_SIZE_64K; 8614 else 8615 new_size = UVM_CHUNK_SIZE_4K; 8616 8617 UVM_ASSERT(IS_ALIGNED(chunk_size, new_size)); 
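    // CPU chunks are split one level at a time (2M to 64K, or 64K to 4K). The
    // reverse sysmem mappings of each GPU that has DMA-mapped the parent chunk
    // are split first so they stay in sync with the new chunk size, and they
    // are merged back if the CPU chunk split itself fails.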
8618 8619 uvm_processor_mask_zero(&gpu_split_mask); 8620 for_each_gpu_id(id) { 8621 if (!uvm_va_block_gpu_state_get(block, id)) 8622 continue; 8623 8624 gpu = block_get_gpu(block, id); 8625 8626 // If the parent chunk has not been mapped, there is nothing to split. 8627 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8628 if (gpu_mapping_addr == 0) 8629 continue; 8630 8631 status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8632 gpu_mapping_addr, 8633 new_size); 8634 if (status != NV_OK) 8635 goto merge; 8636 8637 uvm_processor_mask_set(&gpu_split_mask, id); 8638 } 8639 8640 if (new_size == UVM_CHUNK_SIZE_64K) 8641 status = block_split_cpu_chunk_to_64k(block); 8642 else 8643 status = block_split_cpu_chunk_to_4k(block, page_index); 8644 8645 if (status != NV_OK) { 8646 merge: 8647 for_each_gpu_id_in_mask(id, &gpu_split_mask) { 8648 gpu = block_get_gpu(block, id); 8649 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8650 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8651 gpu_mapping_addr, 8652 chunk_size); 8653 } 8654 } 8655 8656 return status; 8657 } 8658 8659 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new) 8660 { 8661 uvm_cpu_chunk_storage_mixed_t *existing_mixed; 8662 uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL; 8663 size_t slot_offset; 8664 size_t existing_slot; 8665 NV_STATUS status = NV_OK; 8666 8667 UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED); 8668 existing_mixed = uvm_cpu_storage_get_ptr(existing); 8669 8670 // Pre-allocate chunk storage for the new block. By definition, the new block 8671 // will contain either 64K and/or 4K chunks. 8672 // 8673 // We do this here so there are no failures in block_split_cpu(). 8674 new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed)); 8675 if (!new_mixed) 8676 return NV_ERR_NO_MEMORY; 8677 8678 slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start)); 8679 existing_slot = slot_offset; 8680 for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) { 8681 size_t new_slot = existing_slot - slot_offset; 8682 8683 if (existing_mixed->slots[existing_slot]) { 8684 uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 8685 8686 if (!small_chunks) { 8687 status = NV_ERR_NO_MEMORY; 8688 goto done; 8689 } 8690 8691 new_mixed->slots[new_slot] = small_chunks; 8692 } 8693 } 8694 8695 new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 8696 UVM_ASSERT(status == NV_OK); 8697 8698 done: 8699 if (status != NV_OK) { 8700 for (; existing_slot > slot_offset; existing_slot--) 8701 uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]); 8702 8703 uvm_kvfree(new_mixed); 8704 } 8705 8706 return status; 8707 } 8708 8709 static void block_free_cpu_chunk_storage(uvm_va_block_t *block) 8710 { 8711 if (block->cpu.chunks) { 8712 uvm_cpu_chunk_storage_mixed_t *mixed; 8713 size_t slot_index; 8714 8715 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8716 mixed = uvm_cpu_storage_get_ptr(block); 8717 for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++) 8718 uvm_kvfree(mixed->slots[slot_index]); 8719 8720 uvm_kvfree(mixed); 8721 block->cpu.chunks = 0; 8722 } 8723 } 8724 8725 // Perform any CPU chunk splitting that may be required for this block split. 
8726 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block. 8727 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) 8728 { 8729 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); 8730 uvm_cpu_chunk_t *splitting_chunk; 8731 uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes(); 8732 uvm_chunk_size_t subchunk_size; 8733 NV_STATUS status = NV_OK; 8734 8735 UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE)); 8736 splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8737 8738 // If the page covering the split point has not been populated, there is no 8739 // need to split. 8740 if (!splitting_chunk) 8741 return NV_OK; 8742 8743 // If the split point is aligned on the chunk size, there is no need to 8744 // split. 8745 if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk))) 8746 return NV_OK; 8747 8748 // Remove all sizes above the chunk's current size. 8749 split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1; 8750 // Remove all sizes below the alignment of the new block's start. 8751 split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0); 8752 8753 for_each_chunk_size_rev(subchunk_size, split_sizes) { 8754 status = block_split_cpu_chunk_one(existing, page_index); 8755 if (status != NV_OK) 8756 return status; 8757 } 8758 8759 return block_prealloc_cpu_chunk_storage(existing, new); 8760 } 8761 8762 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index) 8763 { 8764 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block); 8765 size_t slot_index = compute_slot_index(block, page_index); 8766 uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index]; 8767 uvm_cpu_chunk_t *merged_chunk; 8768 8769 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8770 UVM_ASSERT(small_chunks); 8771 UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks)); 8772 8773 merged_chunk = uvm_cpu_chunk_merge(small_chunks); 8774 mixed->slots[slot_index] = merged_chunk; 8775 set_bit(slot_index, mixed->big_chunks); 8776 uvm_kvfree(small_chunks); 8777 } 8778 8779 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index) 8780 { 8781 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block); 8782 uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots; 8783 uvm_cpu_chunk_t *merged_chunk; 8784 8785 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8786 UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK)); 8787 8788 merged_chunk = uvm_cpu_chunk_merge(big_chunks); 8789 block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK; 8790 uvm_kvfree(mixed); 8791 } 8792 8793 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index) 8794 { 8795 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8796 uvm_gpu_id_t id; 8797 8798 if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) { 8799 block_merge_cpu_chunks_to_64k(block, page_index); 8800 } 8801 else { 8802 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 8803 block_merge_cpu_chunks_to_2m(block, page_index); 8804 } 8805 8806 chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8807 8808 for_each_gpu_id(id) { 8809 NvU64 gpu_mapping_addr; 8810 uvm_gpu_t *gpu; 8811 8812 if (!uvm_va_block_gpu_state_get(block, id)) 8813 
continue; 8814 8815 gpu = block_get_gpu(block, id); 8816 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8817 if (gpu_mapping_addr == 0) 8818 continue; 8819 8820 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8821 gpu_mapping_addr, 8822 uvm_cpu_chunk_get_size(chunk)); 8823 } 8824 } 8825 8826 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) 8827 { 8828 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); 8829 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8830 uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes(); 8831 uvm_chunk_size_t largest_size; 8832 uvm_chunk_size_t chunk_size; 8833 uvm_chunk_size_t merge_size; 8834 size_t block_size = uvm_va_block_size(existing); 8835 8836 if (!chunk || uvm_cpu_chunk_is_physical(chunk)) 8837 return; 8838 8839 chunk_size = uvm_cpu_chunk_get_size(chunk); 8840 8841 // Remove all CPU chunk sizes above the size of the existing VA block. 8842 // Since block sizes are not always powers of 2, use the largest power of 2 8843 // less than or equal to the block size since we can't merge to a size 8844 // larger than the block's size. 8845 largest_size = rounddown_pow_of_two(block_size); 8846 merge_sizes &= (largest_size | (largest_size - 1)); 8847 8848 // Remove all CPU chunk sizes smaller than the size of the chunk being merged up. 8849 merge_sizes &= ~(chunk_size | (chunk_size - 1)); 8850 8851 for_each_chunk_size(merge_size, merge_sizes) { 8852 uvm_va_block_region_t chunk_region; 8853 8854 // The block has to fully contain the VA range after the merge. 8855 if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) || 8856 !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1)) 8857 break; 8858 8859 chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index); 8860 8861 // If not all pages in the region covered by the chunk are allocated, 8862 // we can't merge. 8863 if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region)) 8864 break; 8865 8866 block_merge_cpu_chunks_one(existing, chunk_region.first); 8867 chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8868 if (uvm_cpu_chunk_is_physical(chunk)) 8869 break; 8870 } 8871 8872 block_free_cpu_chunk_storage(new); 8873 } 8874 8875 // Pre-allocate everything which doesn't require retry on both existing and new 8876 // which will be needed to handle a split. If this fails, existing must remain 8877 // functionally unmodified. 
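//
// Note that "functionally unmodified" is not "bit-for-bit identical": on
// failure the error path below merges back any chunks this function split,
// and the GPU chunks arrays may be left larger than strictly necessary, but
// the block's mappings and data residency are not changed.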
8878 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new) 8879 { 8880 NV_STATUS status; 8881 uvm_gpu_t *gpu; 8882 uvm_gpu_id_t id; 8883 uvm_page_index_t split_page_index; 8884 uvm_va_block_test_t *block_test; 8885 8886 status = block_presplit_cpu_chunks(existing, new); 8887 if (status != NV_OK) 8888 goto error; 8889 8890 for_each_gpu_id(id) { 8891 if (!uvm_va_block_gpu_state_get(existing, id)) 8892 continue; 8893 8894 gpu = block_get_gpu(existing, id); 8895 8896 status = block_presplit_gpu_chunks(existing, new, gpu); 8897 if (status != NV_OK) 8898 goto error; 8899 8900 if (!block_gpu_state_get_alloc(new, gpu)) { 8901 status = NV_ERR_NO_MEMORY; 8902 goto error; 8903 } 8904 } 8905 8906 block_test = uvm_va_block_get_test(existing); 8907 if (block_test && block_test->inject_split_error) { 8908 block_test->inject_split_error = false; 8909 if (!uvm_va_block_is_hmm(existing)) { 8910 UVM_ASSERT(existing->va_range->inject_split_error); 8911 existing->va_range->inject_split_error = false; 8912 } 8913 status = NV_ERR_NO_MEMORY; 8914 goto error; 8915 } 8916 8917 if (uvm_va_block_is_hmm(existing)) { 8918 uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start); 8919 8920 if (node && node->node.start != new->start) { 8921 status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL); 8922 if (status != NV_OK) 8923 goto error; 8924 } 8925 } 8926 8927 return NV_OK; 8928 8929 error: 8930 // Merge back the chunks we split 8931 split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8932 8933 for_each_gpu_id(id) { 8934 uvm_gpu_chunk_t *chunk; 8935 size_t chunk_index; 8936 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id); 8937 8938 if (!existing_gpu_state) 8939 continue; 8940 8941 // If the chunk spanning the split point was split, merge it back 8942 gpu = block_get_gpu(existing, id); 8943 chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL); 8944 chunk = existing_gpu_state->chunks[chunk_index]; 8945 if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) 8946 continue; 8947 8948 block_merge_chunk(existing, gpu, chunk); 8949 8950 // We could attempt to shrink the chunks array back down, but it doesn't 8951 // hurt much to have it larger than necessary, and we'd have to handle 8952 // the shrink call failing anyway on this error path. 8953 8954 } 8955 8956 block_merge_cpu_chunks(existing, new); 8957 8958 return status; 8959 } 8960 8961 // Re-calculate the block's top-level processor masks: 8962 // - block->mapped 8963 // - block->resident 8964 // 8965 // This is called on block split. 
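//
// After a split, a processor which was mapped or resident only on pages now
// owned by the other block must be cleared from this block's masks. Each mask
// is recomputed below from the per-page state over the block's (possibly
// shrunken) region.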
8966 static void block_set_processor_masks(uvm_va_block_t *block) 8967 { 8968 size_t num_pages = uvm_va_block_num_cpu_pages(block); 8969 uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages); 8970 uvm_gpu_id_t id; 8971 8972 if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) { 8973 UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region)); 8974 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 8975 } 8976 else { 8977 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); 8978 } 8979 8980 if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) { 8981 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 8982 8983 if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0) 8984 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)); 8985 8986 block_clear_resident_processor(block, UVM_ID_CPU); 8987 } 8988 else { 8989 block_set_resident_processor(block, UVM_ID_CPU); 8990 } 8991 8992 for_each_gpu_id(id) { 8993 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 8994 if (!gpu_state) 8995 continue; 8996 8997 if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) { 8998 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region)); 8999 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region)); 9000 uvm_processor_mask_clear(&block->mapped, id); 9001 } 9002 else { 9003 uvm_processor_mask_set(&block->mapped, id); 9004 } 9005 9006 if (uvm_page_mask_region_empty(&gpu_state->resident, block_region)) 9007 block_clear_resident_processor(block, id); 9008 else 9009 block_set_resident_processor(block, id); 9010 9011 if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region)) 9012 uvm_processor_mask_clear(&block->evicted_gpus, id); 9013 else 9014 uvm_processor_mask_set(&block->evicted_gpus, id); 9015 } 9016 } 9017 9018 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts 9019 // corresponding to a block split. 9020 static void block_split_page_mask(uvm_page_mask_t *existing_mask, 9021 size_t existing_pages, 9022 uvm_page_mask_t *new_mask, 9023 size_t new_pages) 9024 { 9025 UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n", 9026 existing_pages, new_pages); 9027 9028 // The new block is always in the upper region of existing, so shift the bit 9029 // vectors down. 9030 // 9031 // Note that bitmap_shift_right requires both dst and src to be the same 9032 // size. That's ok since we don't scale them by block size. 9033 uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages); 9034 uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages)); 9035 } 9036 9037 // Split the CPU state within the existing block. existing's start is correct 9038 // but its end has not yet been adjusted. 
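//
// Illustrative example (assuming a 4KB PAGE_SIZE): splitting a 2MB block at
// existing->start + 1MB leaves CPU pages [0, 255] in existing, while the
// chunks covering pages [256, 511] are removed from existing and re-inserted
// into new at page indices [0, 255], i.e. shifted down by split_page_index.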
9039 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new) 9040 { 9041 size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new); 9042 uvm_pte_bits_cpu_t pte_bit; 9043 uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing); 9044 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 9045 uvm_page_index_t page_index; 9046 uvm_page_index_t next_page_index; 9047 uvm_cpu_chunk_t *chunk; 9048 uvm_va_range_t *existing_va_range = existing->va_range; 9049 9050 if (existing_va_range) { 9051 UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9052 UVM_ASSERT(existing->va_range->type == new->va_range->type); 9053 } 9054 9055 UVM_ASSERT(existing->start < new->start); 9056 UVM_ASSERT(existing->end == new->end); 9057 9058 UVM_ASSERT(PAGE_ALIGNED(new->start)); 9059 UVM_ASSERT(PAGE_ALIGNED(existing->start)); 9060 9061 existing_pages = (new->start - existing->start) / PAGE_SIZE; 9062 9063 // We don't have to unmap the CPU since its virtual -> physical mappings 9064 // don't change. 9065 9066 page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1); 9067 9068 for_each_cpu_chunk_in_block_region_safe(chunk, 9069 page_index, 9070 next_page_index, 9071 existing, 9072 uvm_va_block_region(split_page_index, block_region.outer)) { 9073 uvm_page_index_t new_chunk_page_index; 9074 NV_STATUS status; 9075 9076 uvm_cpu_chunk_remove_from_block(existing, page_index); 9077 9078 // The chunk has to be adjusted for the new block before inserting it. 9079 new_chunk_page_index = page_index - split_page_index; 9080 9081 // This should never fail because all necessary storage was allocated 9082 // in block_presplit_cpu_chunks(). 9083 status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index); 9084 UVM_ASSERT(status == NV_OK); 9085 } 9086 9087 new->cpu.ever_mapped = existing->cpu.ever_mapped; 9088 9089 block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages); 9090 9091 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) 9092 block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages); 9093 } 9094 9095 // Fill out the blocks' chunks arrays with the chunks split by 9096 // block_presplit_gpu_chunks. 
9097 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 9098 { 9099 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 9100 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id); 9101 uvm_gpu_chunk_t **temp_chunks; 9102 uvm_gpu_chunk_t *original_chunk; 9103 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; 9104 size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new; 9105 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 9106 size_t i; 9107 9108 block_gpu_chunk_get_split_state(existing, 9109 &existing_before_state, 9110 existing->start, 9111 existing->end, 9112 split_page_index, 9113 gpu); 9114 block_gpu_chunk_get_split_state(existing, 9115 &existing_after_state, 9116 existing->start, 9117 new->start - 1, 9118 split_page_index - 1, 9119 gpu); 9120 block_gpu_chunk_get_split_state(new, 9121 &new_state, 9122 new->start, 9123 new->end, 9124 0, 9125 gpu); 9126 9127 // General case (B is original_chunk): 9128 // split 9129 // v 9130 // existing (before) [------ A -----][------ B -----][------ C -----] 9131 // existing (after) [------ A -----][- B0 -] 9132 // new [- B1 -][------ C -----] 9133 // 9134 // Note that the logic below also handles the case of the split happening at 9135 // a chunk boundary. That case behaves as though there is no B0 chunk. 9136 9137 // Number of chunks to the left and right of original_chunk (A and C above). 9138 // Either or both of these may be 0. 9139 num_pre_chunks = existing_before_state.chunk_index; 9140 num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1; 9141 9142 // Number of subchunks under existing's portion of original_chunk (B0 above) 9143 num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks; 9144 9145 // Number of subchunks under new's portion of original_chunk (B1 above) 9146 num_split_chunks_new = new_state.num_chunks - num_post_chunks; 9147 9148 UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0); 9149 UVM_ASSERT(num_split_chunks_new > 0); 9150 9151 // Copy post chunks from the end of existing into new (C above) 9152 memcpy(&new_gpu_state->chunks[num_split_chunks_new], 9153 &existing_gpu_state->chunks[existing_before_state.chunk_index + 1], 9154 num_post_chunks * sizeof(new_gpu_state->chunks[0])); 9155 9156 // Save off the original split chunk since we may overwrite the array 9157 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; 9158 9159 // Fill out the new pointers 9160 if (original_chunk) { 9161 // Note that if the split happened at a chunk boundary, original_chunk 9162 // will not be split. In that case, num_split_chunks_existing will be 0 9163 // and num_split_chunks_new will be 1, so the left copy will be skipped 9164 // and the right copy will pick up the chunk. 9165 9166 // Copy left newly-split chunks into existing (B0 above). The array was 9167 // re-sized in block_presplit_gpu_chunks as necessary. 9168 size_t num_subchunks; 9169 9170 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, 9171 original_chunk, 9172 0, // start_index 9173 num_split_chunks_existing, 9174 &existing_gpu_state->chunks[existing_before_state.chunk_index]); 9175 UVM_ASSERT(num_subchunks == num_split_chunks_existing); 9176 9177 // Copy right newly-split chunks into new (B1 above), overwriting the 9178 // pointer to the original chunk. 
9179 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, 9180 original_chunk, 9181 num_split_chunks_existing, // start_index 9182 num_split_chunks_new, 9183 &new_gpu_state->chunks[0]); 9184 UVM_ASSERT(num_subchunks == num_split_chunks_new); 9185 } 9186 else { 9187 // If the chunk wasn't already populated we don't need to copy pointers 9188 // anywhere, but we need to clear out stale pointers from existing's 9189 // array covering the new elements. new's chunks array was already zero- 9190 // initialized. 9191 memset(&existing_gpu_state->chunks[existing_before_state.chunk_index], 9192 0, 9193 num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0])); 9194 } 9195 9196 // Since we update the reverse map information, protect it against a 9197 // concurrent lookup 9198 uvm_spin_lock(&gpu->pmm.list_lock); 9199 9200 // Update the reverse map of all the chunks that are now under the new block 9201 for (i = 0; i < new_state.num_chunks; ++i) { 9202 if (new_gpu_state->chunks[i]) { 9203 UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing); 9204 new_gpu_state->chunks[i]->va_block = new; 9205 9206 // Adjust the page_index within the VA block for the new subchunks in 9207 // the new VA block 9208 UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index); 9209 new_gpu_state->chunks[i]->va_block_page_index -= split_page_index; 9210 } 9211 } 9212 9213 uvm_spin_unlock(&gpu->pmm.list_lock); 9214 9215 // Attempt to shrink existing's chunk allocation. If the realloc fails, just 9216 // keep on using the old larger one. 9217 if (existing_after_state.num_chunks < existing_before_state.num_chunks) { 9218 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, 9219 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); 9220 if (temp_chunks) 9221 existing_gpu_state->chunks = temp_chunks; 9222 } 9223 } 9224 9225 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id) 9226 { 9227 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id); 9228 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id); 9229 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 9230 uvm_gpu_va_space_t *gpu_va_space; 9231 uvm_gpu_t *gpu; 9232 uvm_gpu_t *accessing_gpu; 9233 size_t new_pages = uvm_va_block_num_cpu_pages(new); 9234 size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big; 9235 uvm_pte_bits_gpu_t pte_bit; 9236 size_t num_chunks, i; 9237 uvm_cpu_chunk_t *cpu_chunk; 9238 uvm_page_index_t page_index; 9239 9240 if (!existing_gpu_state) 9241 return; 9242 9243 gpu = uvm_va_space_get_gpu(va_space, gpu_id); 9244 UVM_ASSERT(new_gpu_state); 9245 9246 new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes; 9247 9248 UVM_ASSERT(PAGE_ALIGNED(new->start)); 9249 UVM_ASSERT(PAGE_ALIGNED(existing->start)); 9250 existing_pages = (new->start - existing->start) / PAGE_SIZE; 9251 9252 for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) { 9253 uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, 9254 uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent), 9255 new); 9256 } 9257 9258 block_copy_split_gpu_chunks(existing, new, gpu); 9259 9260 num_chunks = block_num_gpu_chunks(new, gpu); 9261 9262 // Reparent GPU mappings for indirect peers 9263 for (i = 0; i < num_chunks; ++i) { 9264 uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i]; 9265 if (!chunk) 9266 continue; 9267 9268 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, 
&va_space->indirect_peers[uvm_id_value(gpu->id)]) { 9269 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 9270 9271 uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, 9272 peer_addr, 9273 new); 9274 } 9275 } 9276 9277 block_split_page_mask(&existing_gpu_state->resident, 9278 existing_pages, 9279 &new_gpu_state->resident, 9280 new_pages); 9281 9282 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 9283 block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages, 9284 &new_gpu_state->pte_bits[pte_bit], new_pages); 9285 } 9286 9287 // Adjust page table ranges. 9288 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); 9289 if (gpu_va_space) { 9290 if (existing_gpu_state->page_table_range_big.table) { 9291 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); 9292 9293 // existing's end has not been adjusted yet 9294 existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size); 9295 9296 // Take references on all big pages covered by new 9297 new_pages_big = uvm_va_block_num_big_pages(new, big_page_size); 9298 if (new_pages_big) { 9299 uvm_page_table_range_get_upper(&gpu_va_space->page_tables, 9300 &existing_gpu_state->page_table_range_big, 9301 &new_gpu_state->page_table_range_big, 9302 new_pages_big); 9303 9304 // If the split point is within a big page region, we might have 9305 // a gap since neither existing nor new can use it anymore. 9306 // Get the top N bits from existing's mask to handle that. 9307 bitmap_shift_right(new_gpu_state->big_ptes, 9308 existing_gpu_state->big_ptes, 9309 uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big, 9310 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 9311 9312 new_gpu_state->initialized_big = existing_gpu_state->initialized_big; 9313 } 9314 9315 // Drop existing's references on the big PTEs it no longer covers 9316 // now that new has references on them. Note that neither existing 9317 // nor new might have big PTEs after the split. In that case, this 9318 // shrink will free the entire old range. 9319 uvm_page_table_range_shrink(&gpu_va_space->page_tables, 9320 &existing_gpu_state->page_table_range_big, 9321 existing_pages_big); 9322 9323 if (existing_pages_big == 0) { 9324 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big)); 9325 existing_gpu_state->initialized_big = false; 9326 } 9327 9328 bitmap_clear(existing_gpu_state->big_ptes, 9329 existing_pages_big, 9330 MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big); 9331 } 9332 9333 if (existing_gpu_state->page_table_range_4k.table) { 9334 // Since existing and new share the same PDE we just need to bump 9335 // the ref-count on new's sub-range. 9336 uvm_page_table_range_get_upper(&gpu_va_space->page_tables, 9337 &existing_gpu_state->page_table_range_4k, 9338 &new_gpu_state->page_table_range_4k, 9339 uvm_va_block_size(new) / UVM_PAGE_SIZE_4K); 9340 9341 // Drop existing's references on the PTEs it no longer covers now 9342 // that new has references on them. 9343 existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K); 9344 uvm_page_table_range_shrink(&gpu_va_space->page_tables, 9345 &existing_gpu_state->page_table_range_4k, 9346 existing_pages_4k); 9347 } 9348 9349 // We have to set this explicitly to handle the case of splitting an 9350 // invalid, active 2M PTE with no lower page tables allocated. 
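        // In that case neither the big nor the 4k page table range was
        // allocated, so none of the range adjustments above applied; the
        // asserts below verify exactly that.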
9351 if (existing_gpu_state->pte_is_2m) { 9352 UVM_ASSERT(!existing_gpu_state->page_table_range_big.table); 9353 UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table); 9354 existing_gpu_state->pte_is_2m = false; 9355 } 9356 9357 // existing can't possibly cover 2MB after a split, so drop any 2M PTE 9358 // references it has. We've taken the necessary references on the lower 9359 // tables above. 9360 block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m); 9361 existing_gpu_state->activated_big = false; 9362 existing_gpu_state->activated_4k = false; 9363 } 9364 9365 block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages); 9366 } 9367 9368 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block, 9369 NvU64 new_end, 9370 uvm_va_block_t **new_va_block, 9371 uvm_va_range_t *new_va_range) 9372 { 9373 uvm_va_space_t *va_space; 9374 uvm_va_block_t *new_block = NULL; 9375 NV_STATUS status; 9376 9377 va_space = new_va_range->va_space; 9378 UVM_ASSERT(existing_va_block->va_range); 9379 UVM_ASSERT(existing_va_block->va_range->va_space == va_space); 9380 UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block)); 9381 9382 // External range types can't be split 9383 UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9384 UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9385 uvm_assert_rwsem_locked_write(&va_space->lock); 9386 9387 UVM_ASSERT(new_end > existing_va_block->start); 9388 UVM_ASSERT(new_end < existing_va_block->end); 9389 UVM_ASSERT(PAGE_ALIGNED(new_end + 1)); 9390 9391 status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block); 9392 if (status != NV_OK) 9393 return status; 9394 9395 // We're protected from other splits and faults by the va_space lock being 9396 // held in write mode, but that doesn't stop the reverse mapping (eviction 9397 // path) from inspecting the existing block. Stop those threads by taking 9398 // the block lock. When a reverse mapping thread takes this lock after the 9399 // split has been performed, it will have to re-inspect state and may see 9400 // that it should use the newly-split block instead. 9401 uvm_mutex_lock(&existing_va_block->lock); 9402 9403 status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range); 9404 9405 uvm_mutex_unlock(&existing_va_block->lock); 9406 9407 if (status != NV_OK) 9408 uvm_va_block_release(new_block); 9409 else if (new_va_block) 9410 *new_va_block = new_block; 9411 9412 return status; 9413 } 9414 9415 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block, 9416 NvU64 new_end, 9417 uvm_va_block_t *new_block, 9418 uvm_va_range_t *new_va_range) 9419 { 9420 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block); 9421 uvm_gpu_id_t id; 9422 NV_STATUS status; 9423 uvm_perf_event_data_t event_data; 9424 9425 UVM_ASSERT(block_check_chunks(existing_va_block)); 9426 9427 // As soon as we update existing's reverse mappings to point to the newly- 9428 // split block, the eviction path could try to operate on the new block. 9429 // Lock that out too until new is ready. 9430 // 9431 // Note that we usually shouldn't nest block locks, but it's ok here because 9432 // we just created new_block so no other thread could possibly take it out 9433 // of order with existing's lock. 
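    //
    // This is presumably also why the _no_tracking lock variant is used just
    // below: the driver's usual lock-order tracking would not expect a second
    // block lock to be acquired while existing's lock is held.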
9434 uvm_mutex_lock_no_tracking(&new_block->lock); 9435 9436 // The split has to be transactional, meaning that if we fail, the existing 9437 // block must not be modified. Handle that by pre-allocating everything we 9438 // might need under both existing and new at the start so we only have a 9439 // single point of failure. 9440 9441 // Since pre-allocation might require allocating new PTEs, we have to handle 9442 // allocation retry which might drop existing's block lock. The 9443 // preallocation is split into two steps for that: the first part which 9444 // allocates and splits PTEs can handle having the block lock dropped then 9445 // re-taken. It won't modify existing_va_block other than adding new PTE 9446 // allocations and splitting existing PTEs, which is always safe. 9447 status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block, 9448 NULL, 9449 block_split_presplit_ptes(existing_va_block, new_block)); 9450 if (status != NV_OK) 9451 goto out; 9452 9453 // Pre-allocate, stage two. This modifies existing_va_block in ways which 9454 // violate many assumptions (such as changing chunk size), but it will put 9455 // things back into place on a failure without dropping the block lock. 9456 status = block_split_preallocate_no_retry(existing_va_block, new_block); 9457 if (status != NV_OK) 9458 goto out; 9459 9460 // We'll potentially be freeing page tables, so we need to wait for any 9461 // outstanding work before we start 9462 status = uvm_tracker_wait(&existing_va_block->tracker); 9463 if (status != NV_OK) 9464 goto out; 9465 9466 // Update existing's state only once we're past all failure points 9467 9468 event_data.block_shrink.block = existing_va_block; 9469 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data); 9470 9471 block_split_cpu(existing_va_block, new_block); 9472 9473 for_each_gpu_id(id) 9474 block_split_gpu(existing_va_block, new_block, id); 9475 9476 // Update the size of the existing block first so that 9477 // block_set_processor_masks can use block_{set,clear}_resident_processor 9478 // that relies on the size to be correct. 9479 existing_va_block->end = new_end; 9480 9481 block_split_page_mask(&existing_va_block->read_duplicated_pages, 9482 uvm_va_block_num_cpu_pages(existing_va_block), 9483 &new_block->read_duplicated_pages, 9484 uvm_va_block_num_cpu_pages(new_block)); 9485 9486 block_split_page_mask(&existing_va_block->maybe_mapped_pages, 9487 uvm_va_block_num_cpu_pages(existing_va_block), 9488 &new_block->maybe_mapped_pages, 9489 uvm_va_block_num_cpu_pages(new_block)); 9490 9491 block_set_processor_masks(existing_va_block); 9492 block_set_processor_masks(new_block); 9493 9494 if (uvm_va_block_is_hmm(existing_va_block)) { 9495 uvm_hmm_va_block_split_tree(existing_va_block, new_block); 9496 uvm_va_policy_node_split_move(existing_va_block, new_block); 9497 } 9498 9499 out: 9500 // Run checks on existing_va_block even on failure, since an error must 9501 // leave the block in a consistent state. 
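    // On failure, new_block's partially pre-allocated CPU chunk storage is
    // also freed below so that releasing new_block later does not leak it.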
9502 UVM_ASSERT(block_check_chunks(existing_va_block)); 9503 UVM_ASSERT(block_check_mappings(existing_va_block)); 9504 if (status == NV_OK) { 9505 UVM_ASSERT(block_check_chunks(new_block)); 9506 UVM_ASSERT(block_check_mappings(new_block)); 9507 } 9508 else { 9509 block_free_cpu_chunk_storage(new_block); 9510 } 9511 9512 uvm_mutex_unlock_no_tracking(&new_block->lock); 9513 9514 return status; 9515 } 9516 9517 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block, 9518 uvm_va_block_region_t region) 9519 { 9520 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9521 uvm_va_range_t *va_range = va_block->va_range; 9522 9523 if (!uvm_va_space_can_read_duplicate(va_space, NULL)) 9524 return false; 9525 9526 // TODO: Bug 3660922: need to implement HMM read duplication support. 9527 if (uvm_va_block_is_hmm(va_block) || 9528 uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED) 9529 return false; 9530 9531 if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET 9532 && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0) 9533 return false; 9534 9535 return true; 9536 } 9537 9538 // Returns the new access permission for the processor that faulted or 9539 // triggered access counter notifications on the given page 9540 // 9541 // TODO: Bug 1766424: this function works on a single page at a time. This 9542 // could be changed in the future to optimize multiple faults/counters on 9543 // contiguous pages. 9544 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block, 9545 struct vm_area_struct *hmm_vma, 9546 uvm_page_index_t page_index, 9547 uvm_processor_id_t fault_processor_id, 9548 uvm_processor_id_t new_residency, 9549 uvm_fault_access_type_t access_type) 9550 { 9551 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9552 uvm_prot_t logical_prot, new_prot; 9553 9554 // TODO: Bug 1766432: Refactor into policies. Current policy is 9555 // query_promote: upgrade access privileges to avoid future faults IF 9556 // they don't trigger further revocations. 9557 new_prot = uvm_fault_access_type_to_prot(access_type); 9558 logical_prot = compute_logical_prot(va_block, hmm_vma, page_index); 9559 9560 UVM_ASSERT(logical_prot >= new_prot); 9561 9562 if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY && 9563 !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) { 9564 uvm_processor_mask_t processors_with_atomic_mapping; 9565 uvm_processor_mask_t revoke_processors; 9566 9567 block_page_authorized_processors(va_block, 9568 page_index, 9569 UVM_PROT_READ_WRITE_ATOMIC, 9570 &processors_with_atomic_mapping); 9571 9572 uvm_processor_mask_andnot(&revoke_processors, 9573 &processors_with_atomic_mapping, 9574 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 9575 9576 // Only check if there are no faultable processors in the revoke 9577 // processors mask. 
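        // In other words, the promotion to WRITE is only taken when it would
        // not force an atomic revocation from any faultable processor, per the
        // query_promote policy described above.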
9578 uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors); 9579 9580 if (uvm_processor_mask_empty(&revoke_processors)) 9581 new_prot = UVM_PROT_READ_WRITE; 9582 } 9583 if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) { 9584 if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id)) 9585 new_prot = UVM_PROT_READ_WRITE_ATOMIC; 9586 } 9587 9588 return new_prot; 9589 } 9590 9591 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block, 9592 uvm_va_block_context_t *va_block_context, 9593 uvm_processor_id_t new_residency, 9594 uvm_processor_id_t processor_id, 9595 const uvm_processor_mask_t *map_processors, 9596 uvm_va_block_region_t region, 9597 const uvm_page_mask_t *map_page_mask, 9598 uvm_prot_t max_prot, 9599 const uvm_processor_mask_t *thrashing_processors, 9600 uvm_tracker_t *tracker) 9601 { 9602 NV_STATUS status; 9603 uvm_processor_id_t map_processor_id; 9604 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9605 uvm_prot_t new_map_prot = max_prot; 9606 uvm_processor_mask_t map_processors_local; 9607 9608 uvm_processor_mask_copy(&map_processors_local, map_processors); 9609 9610 // Handle atomic mappings separately 9611 if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) { 9612 bool this_processor_has_native_atomics; 9613 9614 this_processor_has_native_atomics = 9615 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id); 9616 9617 if (this_processor_has_native_atomics) { 9618 uvm_processor_mask_t map_atomic_processors; 9619 9620 // Compute processors with native atomics to the residency 9621 uvm_processor_mask_and(&map_atomic_processors, 9622 &map_processors_local, 9623 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 9624 9625 // Filter out these mapped processors for the next steps 9626 uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors); 9627 9628 for_each_id_in_mask(map_processor_id, &map_atomic_processors) { 9629 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; 9630 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) 9631 cause = UvmEventMapRemoteCauseThrashing; 9632 9633 status = uvm_va_block_map(va_block, 9634 va_block_context, 9635 map_processor_id, 9636 region, 9637 map_page_mask, 9638 UVM_PROT_READ_WRITE_ATOMIC, 9639 cause, 9640 tracker); 9641 if (status != NV_OK) 9642 return status; 9643 } 9644 9645 new_map_prot = UVM_PROT_READ_WRITE; 9646 } 9647 else { 9648 if (UVM_ID_IS_CPU(processor_id)) 9649 new_map_prot = UVM_PROT_READ_WRITE; 9650 else 9651 new_map_prot = UVM_PROT_READ_ONLY; 9652 } 9653 } 9654 9655 // Map the rest of processors 9656 for_each_id_in_mask(map_processor_id, &map_processors_local) { 9657 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; 9658 uvm_prot_t final_map_prot; 9659 bool map_processor_has_enabled_system_wide_atomics = 9660 uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id); 9661 9662 // Write mappings from processors with disabled system-wide atomics are treated like atomics 9663 if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics) 9664 final_map_prot = UVM_PROT_READ_WRITE_ATOMIC; 9665 else 9666 final_map_prot = new_map_prot; 9667 9668 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) 9669 cause = 
UvmEventMapRemoteCauseThrashing; 9670 9671 status = uvm_va_block_map(va_block, 9672 va_block_context, 9673 map_processor_id, 9674 region, 9675 map_page_mask, 9676 final_map_prot, 9677 cause, 9678 tracker); 9679 if (status != NV_OK) 9680 return status; 9681 } 9682 9683 return NV_OK; 9684 } 9685 9686 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block, 9687 uvm_va_block_context_t *va_block_context, 9688 uvm_processor_id_t new_residency, 9689 uvm_processor_id_t processor_id, 9690 uvm_va_block_region_t region, 9691 const uvm_page_mask_t *map_page_mask, 9692 uvm_prot_t max_prot, 9693 const uvm_processor_mask_t *thrashing_processors) 9694 { 9695 NV_STATUS tracker_status, status = NV_OK; 9696 uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus; 9697 uvm_processor_id_t map_processor_id; 9698 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9699 const uvm_page_mask_t *final_page_mask = map_page_mask; 9700 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 9701 const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region); 9702 uvm_processor_id_t preferred_location; 9703 9704 uvm_assert_mutex_locked(&va_block->lock); 9705 9706 // Read duplication takes precedence over SetAccessedBy. 9707 // 9708 // Exclude ranges with read duplication set... 9709 if (uvm_va_policy_is_read_duplicate(policy, va_space)) { 9710 status = NV_OK; 9711 goto out; 9712 } 9713 9714 // ... and pages read-duplicated by performance heuristics 9715 if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) { 9716 if (map_page_mask) { 9717 uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask, 9718 map_page_mask, 9719 &va_block->read_duplicated_pages); 9720 } 9721 else { 9722 uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages); 9723 } 9724 final_page_mask = &va_block_context->mapping.filtered_page_mask; 9725 } 9726 9727 // Add mappings for accessed_by processors and the given processor mask 9728 if (thrashing_processors) 9729 uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors); 9730 else 9731 uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by); 9732 9733 // Only processors that can access the new location must be considered 9734 uvm_processor_mask_and(&map_other_processors, 9735 &map_other_processors, 9736 &va_space->accessible_from[uvm_id_value(new_residency)]); 9737 9738 // Exclude caller processor as it must have already been mapped 9739 uvm_processor_mask_clear(&map_other_processors, processor_id); 9740 9741 // Exclude preferred location so it won't get remote mappings 9742 preferred_location = policy->preferred_location; 9743 if (UVM_ID_IS_VALID(preferred_location) && 9744 !uvm_id_equal(new_residency, preferred_location) && 9745 uvm_va_space_processor_has_memory(va_space, preferred_location)) { 9746 uvm_processor_mask_clear(&map_other_processors, preferred_location); 9747 } 9748 9749 // Map the UVM-Lite GPUs if the new location is the preferred location. This 9750 // will only create mappings on first touch. After that they're persistent 9751 // so uvm_va_block_map will be a no-op. 
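    //
    // (UVM-Lite GPUs do not fault on these ranges, which is presumably why
    // their mappings to the preferred location are created eagerly here with
    // full protection rather than on demand.)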
9752 uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block)); 9753 if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) && 9754 uvm_id_equal(new_residency, preferred_location)) { 9755 for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) { 9756 status = uvm_va_block_map(va_block, 9757 va_block_context, 9758 map_processor_id, 9759 region, 9760 final_page_mask, 9761 UVM_PROT_READ_WRITE_ATOMIC, 9762 UvmEventMapRemoteCauseCoherence, 9763 &local_tracker); 9764 if (status != NV_OK) 9765 goto out; 9766 } 9767 } 9768 9769 uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block)); 9770 9771 // We can't map non-migratable pages to the CPU. If we have any, build a 9772 // new mask of migratable pages and map the CPU separately. 9773 if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) && 9774 !uvm_range_group_all_migratable(va_space, 9775 uvm_va_block_region_start(va_block, region), 9776 uvm_va_block_region_end(va_block, region))) { 9777 uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask; 9778 9779 uvm_range_group_migratable_page_mask(va_block, region, migratable_mask); 9780 if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) { 9781 uvm_processor_mask_t cpu_mask; 9782 uvm_processor_mask_zero(&cpu_mask); 9783 uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU); 9784 9785 status = do_block_add_mappings_after_migration(va_block, 9786 va_block_context, 9787 new_residency, 9788 processor_id, 9789 &cpu_mask, 9790 region, 9791 migratable_mask, 9792 max_prot, 9793 thrashing_processors, 9794 &local_tracker); 9795 if (status != NV_OK) 9796 goto out; 9797 } 9798 9799 uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU); 9800 } 9801 9802 status = do_block_add_mappings_after_migration(va_block, 9803 va_block_context, 9804 new_residency, 9805 processor_id, 9806 &map_other_processors, 9807 region, 9808 final_page_mask, 9809 max_prot, 9810 thrashing_processors, 9811 &local_tracker); 9812 if (status != NV_OK) 9813 goto out; 9814 9815 out: 9816 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 9817 uvm_tracker_deinit(&local_tracker); 9818 return status == NV_OK ? tracker_status : status; 9819 } 9820 9821 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block, 9822 uvm_processor_id_t processor_id, 9823 uvm_page_index_t page_index) 9824 { 9825 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9826 uvm_processor_mask_t resident_processors; 9827 NvU32 resident_processors_count; 9828 9829 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id)) 9830 return UVM_PROT_READ_WRITE_ATOMIC; 9831 9832 uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); 9833 resident_processors_count = uvm_processor_mask_get_count(&resident_processors); 9834 9835 if (resident_processors_count == 0) { 9836 return UVM_PROT_NONE; 9837 } 9838 else if (resident_processors_count > 1) { 9839 // If there are many copies, we can only map READ ONLY 9840 // 9841 // The block state doesn't track the mapping target (aperture) of each 9842 // individual PTE, just the permissions and where the data is resident. 9843 // If the data is resident in multiple places, then we have a problem 9844 // since we can't know where the PTE points. This means we won't know 9845 // what needs to be unmapped for cases like UvmUnregisterGpu and 9846 // UvmDisablePeerAccess. 
9847 // 9848 // The simple way to solve this is to enforce that a read-duplication 9849 // mapping always points to local memory. 9850 if (uvm_processor_mask_test(&resident_processors, processor_id)) 9851 return UVM_PROT_READ_ONLY; 9852 9853 return UVM_PROT_NONE; 9854 } 9855 else { 9856 uvm_processor_id_t atomic_id; 9857 uvm_processor_id_t residency; 9858 uvm_processor_mask_t atomic_mappings; 9859 uvm_processor_mask_t write_mappings; 9860 9861 // Search the id of the processor with the only resident copy 9862 residency = uvm_processor_mask_find_first_id(&resident_processors); 9863 UVM_ASSERT(UVM_ID_IS_VALID(residency)); 9864 9865 // If we cannot map the processor with the resident copy, exit 9866 if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id)) 9867 return UVM_PROT_NONE; 9868 9869 // Fast path: if the page is not mapped anywhere else, it can be safely 9870 // mapped with RWA permission 9871 if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index)) 9872 return UVM_PROT_READ_WRITE_ATOMIC; 9873 9874 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); 9875 9876 // Exclude processors with system-wide atomics disabled from atomic_mappings 9877 uvm_processor_mask_and(&atomic_mappings, 9878 &atomic_mappings, 9879 &va_space->system_wide_atomics_enabled_processors); 9880 9881 // Exclude the processor for which the mapping protections are being computed 9882 uvm_processor_mask_clear(&atomic_mappings, processor_id); 9883 9884 // If there is any processor with atomic mapping, check if it has native atomics to the processor 9885 // with the resident copy. If it does not, we can only map READ ONLY 9886 atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings); 9887 if (UVM_ID_IS_VALID(atomic_id) && 9888 !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) { 9889 return UVM_PROT_READ_ONLY; 9890 } 9891 9892 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings); 9893 9894 // Exclude the processor for which the mapping protections are being computed 9895 uvm_processor_mask_clear(&write_mappings, processor_id); 9896 9897 // At this point, any processor with atomic mappings either has native 9898 // atomics support to the processor with the resident copy or has 9899 // disabled system-wide atomics. If the requesting processor has 9900 // disabled system-wide atomics or has native atomics to that processor, 9901 // we can map with ATOMIC privileges. Likewise, if there are no other 9902 // processors with WRITE or ATOMIC mappings, we can map with ATOMIC 9903 // privileges. For HMM, don't allow GPU atomic access to remote mapped 9904 // system memory even if there are no write mappings since CPU access 9905 // can be upgraded without notification. 
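        // Summarizing the check below: the requesting processor gets atomic
        // (RWA) permission if its system-wide atomics are disabled, or if it
        // has native atomics to the residency, or if no other processor holds
        // a write/atomic mapping and the block is not HMM; otherwise it gets
        // read-write.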
9906 if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) || 9907 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) || 9908 (uvm_processor_mask_empty(&write_mappings) && !uvm_va_block_is_hmm(va_block))) { 9909 return UVM_PROT_READ_WRITE_ATOMIC; 9910 } 9911 9912 return UVM_PROT_READ_WRITE; 9913 } 9914 } 9915 9916 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block, 9917 uvm_va_block_context_t *va_block_context, 9918 uvm_processor_id_t processor_id, 9919 uvm_va_block_region_t region, 9920 const uvm_page_mask_t *page_mask, 9921 UvmEventMapRemoteCause cause) 9922 { 9923 uvm_va_range_t *va_range = va_block->va_range; 9924 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9925 NV_STATUS status = NV_OK; 9926 uvm_page_index_t page_index; 9927 uvm_range_group_range_iter_t iter; 9928 uvm_prot_t prot_to_map; 9929 9930 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { 9931 if (!uvm_va_range_vma_check(va_range, va_block_context->mm)) 9932 return NV_OK; 9933 9934 uvm_range_group_range_migratability_iter_first(va_space, 9935 uvm_va_block_region_start(va_block, region), 9936 uvm_va_block_region_end(va_block, region), 9937 &iter); 9938 } 9939 9940 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) 9941 va_block_context->mask_by_prot[prot_to_map - 1].count = 0; 9942 9943 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 9944 // Read duplication takes precedence over SetAccessedBy. Exclude pages 9945 // read-duplicated by performance heuristics 9946 if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index)) 9947 continue; 9948 9949 prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index); 9950 if (prot_to_map == UVM_PROT_NONE) 9951 continue; 9952 9953 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { 9954 while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) { 9955 uvm_range_group_range_migratability_iter_next(va_space, 9956 &iter, 9957 uvm_va_block_region_end(va_block, region)); 9958 } 9959 9960 if (!iter.migratable) 9961 continue; 9962 } 9963 9964 if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0) 9965 uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask); 9966 9967 uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index); 9968 } 9969 9970 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) { 9971 if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0) 9972 continue; 9973 9974 status = uvm_va_block_map(va_block, 9975 va_block_context, 9976 processor_id, 9977 region, 9978 &va_block_context->mask_by_prot[prot_to_map - 1].page_mask, 9979 prot_to_map, 9980 cause, 9981 &va_block->tracker); 9982 if (status != NV_OK) 9983 break; 9984 } 9985 9986 return status; 9987 } 9988 9989 static bool can_read_duplicate(uvm_va_block_t *va_block, 9990 uvm_page_index_t page_index, 9991 const uvm_va_policy_t *policy, 9992 const uvm_perf_thrashing_hint_t *thrashing_hint) 9993 { 9994 if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block))) 9995 return true; 9996 9997 if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && 9998 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) && 9999 thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN) 10000 return true; 10001 10002 return false; 
10003 } 10004 10005 // TODO: Bug 1827400: If the faulting processor has support for native 10006 // atomics to the current location and the faults on the page were 10007 // triggered by atomic accesses only, we keep the current residency. 10008 // This is a short-term solution to exercise remote atomics over 10009 // NVLINK when possible (not only when preferred location is set to 10010 // the remote GPU) as they are much faster than relying on page 10011 // faults and permission downgrades, which cause thrashing. In the 10012 // future, the thrashing detection/prevention heuristics should 10013 // detect and handle this case. 10014 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space, 10015 NvU32 access_type_mask, 10016 uvm_processor_id_t processor_id, 10017 uvm_processor_id_t residency) 10018 { 10019 // This policy can be enabled/disabled using a module parameter 10020 if (!uvm_perf_map_remote_on_native_atomics_fault) 10021 return false; 10022 10023 // Only consider atomics faults 10024 if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK) 10025 return false; 10026 10027 // We cannot differentiate CPU writes from atomics. We exclude CPU faults 10028 // from the logic explained above in order to avoid mapping CPU to vidmem 10029 // memory due to a write. 10030 if (UVM_ID_IS_CPU(processor_id)) 10031 return false; 10032 10033 // On P9 systems (which have native HW support for system-wide atomics), we 10034 // have determined experimentally that placing memory on a GPU yields the 10035 // best performance on most cases (since CPU can cache vidmem but not vice 10036 // versa). Therefore, don't map remotely if the current residency is 10037 // sysmem. 10038 if (UVM_ID_IS_CPU(residency)) 10039 return false; 10040 10041 return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id); 10042 } 10043 10044 // TODO: Bug 1766424: this function works on a single page at a time. This 10045 // could be changed in the future to optimize multiple faults or access 10046 // counter notifications on contiguous pages. 10047 static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block, 10048 uvm_va_block_context_t *va_block_context, 10049 uvm_page_index_t page_index, 10050 uvm_processor_id_t processor_id, 10051 NvU32 access_type_mask, 10052 const uvm_va_policy_t *policy, 10053 const uvm_perf_thrashing_hint_t *thrashing_hint, 10054 uvm_service_operation_t operation, 10055 bool *read_duplicate) 10056 { 10057 uvm_processor_id_t closest_resident_processor; 10058 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10059 bool may_read_duplicate; 10060 uvm_processor_id_t preferred_location; 10061 10062 // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as 10063 // HMM migration is implemented VMAs other than anonymous memory. 10064 if (is_uvm_fault_force_sysmem_set() || uvm_hmm_must_use_sysmem(va_block, va_block_context)) { 10065 *read_duplicate = false; 10066 return UVM_ID_CPU; 10067 } 10068 10069 may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint); 10070 10071 // Read/prefetch faults on a VA range with read duplication enabled 10072 // always create a copy of the page on the faulting processor's memory. 10073 // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH, 10074 // which will lead to read duplication if it is enabled. 
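    // For example: an access type mask containing only READ and/or PREFETCH
    // accesses satisfies the check below, while any WRITE or ATOMIC access in
    // the mask disables read duplication for this fault and falls through to
    // the local-copy handling further down.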
10075 *read_duplicate = may_read_duplicate && 10076 (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ); 10077 10078 if (*read_duplicate) 10079 return processor_id; 10080 10081 *read_duplicate = false; 10082 10083 // If read-duplication is active in the page but we are not 10084 // read-duplicating because the access type is not a read or a prefetch, 10085 // the faulting processor should get a local copy 10086 if (may_read_duplicate) 10087 return processor_id; 10088 10089 // If the faulting processor is the preferred location always migrate 10090 preferred_location = policy->preferred_location; 10091 if (uvm_id_equal(processor_id, preferred_location)) { 10092 if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) { 10093 UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN); 10094 if (uvm_va_space_processor_has_memory(va_space, processor_id)) 10095 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id)); 10096 } 10097 10098 return processor_id; 10099 } 10100 10101 // If the faulting processor is the CPU, HMM has to migrate the block to 10102 // system memory. 10103 // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements. 10104 if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block)) 10105 return processor_id; 10106 10107 if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 10108 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)], 10109 processor_id)); 10110 return thrashing_hint->pin.residency; 10111 } 10112 10113 closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id); 10114 10115 // If the page is not resident anywhere, select the preferred location as 10116 // long as the preferred location is accessible from the faulting processor. 10117 // Otherwise select the faulting processor. 10118 if (UVM_ID_IS_INVALID(closest_resident_processor)) { 10119 if (UVM_ID_IS_VALID(preferred_location) && 10120 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], 10121 processor_id)) { 10122 return preferred_location; 10123 } 10124 10125 return processor_id; 10126 } 10127 10128 // AccessedBy mappings might have not been created for the CPU if the thread 10129 // which made the memory resident did not have the proper references on the 10130 // mm_struct (for example, the GPU fault handling path when 10131 // uvm_va_space_mm_enabled() is false). 10132 // 10133 // Also, in uvm_migrate_*, we implement a two-pass scheme in which 10134 // AccessedBy mappings may be delayed to the second pass. This can produce 10135 // faults even if the faulting processor is in the accessed_by mask. 10136 // 10137 // Here, we keep it on the current residency and we just add the missing 10138 // mapping. 
10139     if (uvm_processor_mask_test(&policy->accessed_by, processor_id) &&
10140         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
10141         operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10142         return closest_resident_processor;
10143     }
10144
10145     // Check if we should map the closest resident processor remotely on atomic
10146     // fault
10147     if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor))
10148         return closest_resident_processor;
10149
10150     // If the processor has access to the preferred location, and the page is
10151     // not resident on the accessing processor, move it to the preferred
10152     // location.
10153     if (!uvm_id_equal(closest_resident_processor, processor_id) &&
10154         UVM_ID_IS_VALID(preferred_location) &&
10155         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
10156         return preferred_location;
10157
10158     // Check if we should map the closest resident processor remotely on a remote CPU fault.
10159     //
10160     // A CPU fault is serviced on behalf of a Linux process whose address space
10161     // is current->mm. The VA block is also associated with an address space,
10162     // pointed to by va_block_context->mm. If the two match, this is a regular
10163     // (local) fault and we may want to migrate the page from the GPU to the CPU.
10164     // If they differ, this is a 'remote' fault and we may prefer to keep the
10165     // current residency.
10166     //
10167     // Creating a remote mapping without access counters means the memory could
10168     // stay in the wrong place for a long time, which is why we normally avoid
10169     // remote mappings. However, when a NIC accesses memory resident on a GPU,
10170     // it is worth keeping that memory in place for the NIC accesses.
10171     //
10172     // The logic used to detect remote faulting also keeps memory in place for
10173     // ptrace accesses. We would prefer to control those policies separately,
10174     // but the NIC case takes priority.
10175     if (UVM_ID_IS_CPU(processor_id) &&
10176         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
10177         va_block_context->mm != current->mm) {
10178         UVM_ASSERT(va_block_context->mm != NULL);
10179         return closest_resident_processor;
10180     }
10181
10182     // If the page is resident on a processor other than the preferred location,
10183     // or the faulting processor can't access the preferred location, we select
10184     // the faulting processor as the new residency.
10185 return processor_id; 10186 } 10187 10188 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block, 10189 uvm_va_block_context_t *va_block_context, 10190 uvm_page_index_t page_index, 10191 uvm_processor_id_t processor_id, 10192 NvU32 access_type_mask, 10193 const uvm_va_policy_t *policy, 10194 const uvm_perf_thrashing_hint_t *thrashing_hint, 10195 uvm_service_operation_t operation, 10196 bool *read_duplicate) 10197 { 10198 uvm_processor_id_t id; 10199 10200 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10201 va_block_context->hmm.vma, 10202 uvm_va_block_region_for_page(page_index))); 10203 10204 id = block_select_residency(va_block, 10205 va_block_context, 10206 page_index, 10207 processor_id, 10208 access_type_mask, 10209 policy, 10210 thrashing_hint, 10211 operation, 10212 read_duplicate); 10213 10214 // If the intended residency doesn't have memory, fall back to the CPU. 10215 if (!block_processor_has_memory(va_block, id)) { 10216 *read_duplicate = false; 10217 return UVM_ID_CPU; 10218 } 10219 10220 return id; 10221 } 10222 10223 static bool check_access_counters_dont_revoke(uvm_va_block_t *block, 10224 uvm_va_block_context_t *block_context, 10225 uvm_va_block_region_t region, 10226 const uvm_processor_mask_t *revoke_processors, 10227 const uvm_page_mask_t *revoke_page_mask, 10228 uvm_prot_t revoke_prot) 10229 { 10230 uvm_processor_id_t id; 10231 for_each_id_in_mask(id, revoke_processors) { 10232 const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot); 10233 10234 uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot); 10235 10236 UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0); 10237 } 10238 10239 return true; 10240 } 10241 10242 // Update service_context->prefetch_hint, service_context->per_processor_masks, 10243 // and service_context->region. 10244 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block, 10245 const uvm_va_policy_t *policy, 10246 uvm_service_block_context_t *service_context) 10247 { 10248 uvm_processor_id_t new_residency; 10249 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10250 10251 // Performance heuristics policy: we only consider prefetching when there 10252 // are migrations to a single processor, only. 
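    // Note that the hint below is computed against a single destination
    // (new_residency) and only that processor's new_residency mask is extended
    // with the prefetched pages; with more than one destination processor the
    // hint residency is simply set to UVM_ID_INVALID.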
10253 if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) { 10254 uvm_page_index_t page_index; 10255 uvm_page_mask_t *new_residency_mask; 10256 10257 new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors); 10258 new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10259 10260 // Update prefetch tracking structure with the pages that will migrate 10261 // due to faults 10262 uvm_perf_prefetch_get_hint_va_block(va_block, 10263 &service_context->block_context, 10264 new_residency, 10265 new_residency_mask, 10266 service_context->region, 10267 &service_context->prefetch_bitmap_tree, 10268 &service_context->prefetch_hint); 10269 10270 // Obtain the prefetch hint and give a fake fault access type to the 10271 // prefetched pages 10272 if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) { 10273 const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask; 10274 10275 for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) { 10276 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index)); 10277 10278 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH; 10279 10280 if (uvm_va_policy_is_read_duplicate(policy, va_space) || 10281 (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && 10282 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) { 10283 if (service_context->read_duplicate_count++ == 0) 10284 uvm_page_mask_zero(&service_context->read_duplicate_mask); 10285 10286 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); 10287 } 10288 } 10289 10290 uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask); 10291 service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask); 10292 } 10293 } 10294 else { 10295 service_context->prefetch_hint.residency = UVM_ID_INVALID; 10296 } 10297 } 10298 10299 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id, 10300 uvm_processor_id_t new_residency, 10301 uvm_va_block_t *va_block, 10302 uvm_va_block_retry_t *block_retry, 10303 uvm_service_block_context_t *service_context) 10304 { 10305 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10306 uvm_processor_mask_t *all_involved_processors = 10307 &service_context->block_context.make_resident.all_involved_processors; 10308 uvm_page_mask_t *new_residency_mask = 10309 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10310 uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; 10311 uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask; 10312 uvm_make_resident_cause_t cause; 10313 NV_STATUS status; 10314 10315 // 1- Migrate pages 10316 switch (service_context->operation) { 10317 case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS: 10318 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 10319 break; 10320 case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS: 10321 cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 10322 break; 10323 case UVM_SERVICE_OPERATION_ACCESS_COUNTERS: 10324 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 10325 break; 10326 default: 10327 UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation); 10328 // Set cause to silence compiler warning that it may be unused. 
10329 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 10330 break; 10331 } 10332 10333 // Reset masks before all of the make_resident calls 10334 uvm_page_mask_zero(did_migrate_mask); 10335 uvm_processor_mask_zero(all_involved_processors); 10336 10337 // Handle read duplication first so that the caller_page_mask will be free 10338 // to use below and still valid in uvm_va_block_service_finish(). 10339 // TODO: Bug 3660922: need to implement HMM read duplication support. 10340 if (service_context->read_duplicate_count != 0 && 10341 uvm_page_mask_and(caller_page_mask, 10342 new_residency_mask, 10343 &service_context->read_duplicate_mask)) { 10344 status = uvm_va_block_make_resident_read_duplicate(va_block, 10345 block_retry, 10346 &service_context->block_context, 10347 new_residency, 10348 service_context->region, 10349 caller_page_mask, 10350 &service_context->prefetch_hint.prefetch_pages_mask, 10351 cause); 10352 if (status != NV_OK) 10353 return status; 10354 } 10355 10356 if (service_context->read_duplicate_count == 0 || 10357 uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) { 10358 if (service_context->read_duplicate_count == 0) 10359 uvm_page_mask_copy(caller_page_mask, new_residency_mask); 10360 status = uvm_va_block_make_resident_copy(va_block, 10361 block_retry, 10362 &service_context->block_context, 10363 new_residency, 10364 service_context->region, 10365 caller_page_mask, 10366 &service_context->prefetch_hint.prefetch_pages_mask, 10367 cause); 10368 if (status != NV_OK) 10369 return status; 10370 } 10371 10372 if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors)) 10373 service_context->cpu_fault.did_migrate = true; 10374 10375 // 2- Check for ECC errors on all GPUs involved in the migration if CPU is 10376 // the destination. Migrations in response to CPU faults are special 10377 // because they're on the only path (apart from tools) where CUDA is not 10378 // involved and wouldn't have a chance to do its own ECC checking. 10379 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS && 10380 UVM_ID_IS_CPU(new_residency) && 10381 !uvm_processor_mask_empty(all_involved_processors)) { 10382 uvm_gpu_t *gpu; 10383 10384 // Before checking for ECC errors, make sure all of the GPU work 10385 // is finished. Creating mappings on the CPU would have to wait 10386 // for the tracker anyway so this shouldn't hurt performance. 10387 status = uvm_tracker_wait(&va_block->tracker); 10388 if (status != NV_OK) 10389 return status; 10390 10391 for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) { 10392 // We cannot call into RM here so use the no RM ECC check. 10393 status = uvm_gpu_check_ecc_error_no_rm(gpu); 10394 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 10395 // In case we need to call into RM to be sure whether 10396 // there is an ECC error or not, signal that to the 10397 // caller by adding the GPU to the mask. 10398 // 10399 // In that case the ECC error might be noticed only after 10400 // the CPU mappings have been already created below, 10401 // exposing different CPU threads to the possibly corrupt 10402 // data, but this thread will fault eventually and that's 10403 // considered to be an acceptable trade-off between 10404 // performance and ECC error containment. 
10405 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id); 10406 status = NV_OK; 10407 } 10408 if (status != NV_OK) 10409 return status; 10410 } 10411 } 10412 10413 return NV_OK; 10414 } 10415 10416 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id, 10417 uvm_va_block_t *va_block, 10418 uvm_service_block_context_t *service_context) 10419 { 10420 uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id; 10421 uvm_page_mask_t *new_residency_mask = 10422 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10423 uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; 10424 uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask; 10425 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10426 uvm_prot_t new_prot; 10427 uvm_page_index_t page_index; 10428 NV_STATUS status; 10429 10430 // Update residency. 10431 if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask)) 10432 uvm_va_block_make_resident_finish(va_block, 10433 &service_context->block_context, 10434 service_context->region, 10435 caller_page_mask); 10436 10437 uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask); 10438 10439 // The loops below depend on the enums having the following values in order 10440 // to index into service_context->mappings_by_prot[]. 10441 BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1); 10442 BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2); 10443 BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3); 10444 BUILD_BUG_ON(UVM_PROT_MAX != 4); 10445 10446 // 1- Compute mapping protections for the requesting processor on the new 10447 // residency. 10448 for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot) 10449 service_context->mappings_by_prot[new_prot - 1].count = 0; 10450 10451 for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) { 10452 new_prot = compute_new_permission(va_block, 10453 service_context->block_context.hmm.vma, 10454 page_index, 10455 processor_id, 10456 new_residency, 10457 service_context->access_type[page_index]); 10458 10459 if (service_context->mappings_by_prot[new_prot - 1].count++ == 0) 10460 uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask); 10461 10462 uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index); 10463 } 10464 10465 // 2- Revoke permissions 10466 // 10467 // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations. 10468 // Thus, we need to revoke only if residency did not change and we 10469 // are mapping higher than READ ONLY. 
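    //
    // As a rough, self-contained illustration of how the revocation set below is
    // derived (a simplified sketch using plain bitmasks instead of
    // uvm_processor_mask_t; all names in it are hypothetical and not part of this
    // driver):
    //
    //     typedef unsigned int proc_mask_t;
    //
    //     // Processors whose mappings may need to be downgraded when atomics
    //     // behave like writes.
    //     static proc_mask_t revoke_targets(proc_mask_t mapped,
    //                                       proc_mask_t faultable,
    //                                       proc_mask_t native_atomics_to_residency,
    //                                       proc_mask_t sys_wide_atomics_enabled,
    //                                       unsigned faulting_proc)
    //     {
    //         proc_mask_t revoke = mapped & faultable;   // only faultable processors with mappings
    //
    //         revoke &= ~(1u << faulting_proc);          // never revoke the faulting processor
    //         revoke &= ~native_atomics_to_residency;    // native atomics to the residency need no revoke
    //         revoke &= sys_wide_atomics_enabled;        // disabled system-wide atomics behave like writes
    //
    //         return revoke;
    //     }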
10470 for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10471 bool pages_need_revocation; 10472 uvm_processor_mask_t revoke_processors; 10473 uvm_prot_t revoke_prot; 10474 bool this_processor_has_enabled_atomics; 10475 10476 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10477 continue; 10478 10479 pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask, 10480 &service_context->did_not_migrate_mask, 10481 &service_context->mappings_by_prot[new_prot - 1].page_mask); 10482 if (!pages_need_revocation) 10483 continue; 10484 10485 uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors); 10486 10487 // Do not revoke the processor that took the fault 10488 uvm_processor_mask_clear(&revoke_processors, processor_id); 10489 10490 this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, 10491 processor_id); 10492 10493 // Atomic operations on processors with system-wide atomics 10494 // disabled or with native atomics access to new_residency 10495 // behave like writes. 10496 if (new_prot == UVM_PROT_READ_WRITE || 10497 !this_processor_has_enabled_atomics || 10498 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) { 10499 10500 // Exclude processors with native atomics on the resident copy 10501 uvm_processor_mask_andnot(&revoke_processors, 10502 &revoke_processors, 10503 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 10504 10505 // Exclude processors with disabled system-wide atomics 10506 uvm_processor_mask_and(&revoke_processors, 10507 &revoke_processors, 10508 &va_space->system_wide_atomics_enabled_processors); 10509 } 10510 10511 if (UVM_ID_IS_CPU(processor_id)) { 10512 revoke_prot = UVM_PROT_READ_WRITE_ATOMIC; 10513 } 10514 else { 10515 revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE: 10516 UVM_PROT_READ_WRITE_ATOMIC; 10517 } 10518 10519 // UVM-Lite processors must always have RWA mappings 10520 if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) { 10521 // Access counters should never trigger revocations apart from 10522 // read-duplication, which are performed in the calls to 10523 // uvm_va_block_make_resident_read_duplicate, above. 10524 if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { 10525 UVM_ASSERT(check_access_counters_dont_revoke(va_block, 10526 &service_context->block_context, 10527 service_context->region, 10528 &revoke_processors, 10529 &service_context->revocation_mask, 10530 revoke_prot)); 10531 } 10532 10533 // Downgrade other processors' mappings 10534 status = uvm_va_block_revoke_prot_mask(va_block, 10535 &service_context->block_context, 10536 &revoke_processors, 10537 service_context->region, 10538 &service_context->revocation_mask, 10539 revoke_prot); 10540 if (status != NV_OK) 10541 return status; 10542 } 10543 } 10544 10545 // 3- Map requesting processor with the necessary privileges 10546 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10547 const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask; 10548 10549 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10550 continue; 10551 10552 // 3.1 - Unmap CPU pages 10553 // HMM cpu mappings can be upgraded at any time without notification 10554 // so no need to downgrade first. 
10555 if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS && 10556 UVM_ID_IS_CPU(processor_id) && 10557 !uvm_va_block_is_hmm(va_block)) { 10558 // The kernel can downgrade managed CPU mappings at any time without 10559 // notifying us, which means our PTE state could be stale. We 10560 // handle this by unmapping the CPU PTE and re-mapping it again. 10561 // 10562 // A CPU fault is unexpected if: 10563 // curr_prot == RW || (!is_write && curr_prot == RO) 10564 status = uvm_va_block_unmap(va_block, 10565 &service_context->block_context, 10566 UVM_ID_CPU, 10567 service_context->region, 10568 map_prot_mask, 10569 NULL); 10570 if (status != NV_OK) 10571 return status; 10572 } 10573 10574 // 3.2 - Add new mappings 10575 10576 // The faulting processor can be mapped remotely due to user policy or 10577 // the thrashing mitigation heuristics. Therefore, we set the cause 10578 // accordingly in each case. 10579 10580 // Map pages that are thrashing first 10581 if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) { 10582 uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask; 10583 bool pages_need_mapping = uvm_page_mask_and(helper_page_mask, 10584 map_prot_mask, 10585 &service_context->thrashing_pin_mask); 10586 if (pages_need_mapping) { 10587 status = uvm_va_block_map(va_block, 10588 &service_context->block_context, 10589 processor_id, 10590 service_context->region, 10591 helper_page_mask, 10592 new_prot, 10593 UvmEventMapRemoteCauseThrashing, 10594 &va_block->tracker); 10595 if (status != NV_OK) 10596 return status; 10597 10598 // Remove thrashing pages from the map mask 10599 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask, 10600 map_prot_mask, 10601 &service_context->thrashing_pin_mask); 10602 if (!pages_need_mapping) 10603 continue; 10604 10605 map_prot_mask = helper_page_mask; 10606 } 10607 } 10608 10609 status = uvm_va_block_map(va_block, 10610 &service_context->block_context, 10611 processor_id, 10612 service_context->region, 10613 map_prot_mask, 10614 new_prot, 10615 UvmEventMapRemoteCausePolicy, 10616 &va_block->tracker); 10617 if (status != NV_OK) 10618 return status; 10619 } 10620 10621 // 4- If pages did migrate, map SetAccessedBy processors, except for 10622 // UVM-Lite 10623 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10624 bool pages_need_mapping; 10625 10626 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10627 continue; 10628 10629 pages_need_mapping = uvm_page_mask_and(caller_page_mask, 10630 new_residency_mask, 10631 &service_context->mappings_by_prot[new_prot - 1].page_mask); 10632 if (!pages_need_mapping) 10633 continue; 10634 10635 // Map pages that are thrashing 10636 if (service_context->thrashing_pin_count > 0) { 10637 uvm_page_index_t page_index; 10638 10639 for_each_va_block_page_in_region_mask(page_index, 10640 &service_context->thrashing_pin_mask, 10641 service_context->region) { 10642 uvm_processor_mask_t *map_thrashing_processors = NULL; 10643 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index); 10644 10645 // Check protection type 10646 if (!uvm_page_mask_test(caller_page_mask, page_index)) 10647 continue; 10648 10649 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr); 10650 10651 status = uvm_va_block_add_mappings_after_migration(va_block, 10652 &service_context->block_context, 10653 new_residency, 10654 processor_id, 10655 uvm_va_block_region_for_page(page_index), 
10656 caller_page_mask, 10657 new_prot, 10658 map_thrashing_processors); 10659 if (status != NV_OK) 10660 return status; 10661 } 10662 10663 pages_need_mapping = uvm_page_mask_andnot(caller_page_mask, 10664 caller_page_mask, 10665 &service_context->thrashing_pin_mask); 10666 if (!pages_need_mapping) 10667 continue; 10668 } 10669 10670 // Map the rest of pages in a single shot 10671 status = uvm_va_block_add_mappings_after_migration(va_block, 10672 &service_context->block_context, 10673 new_residency, 10674 processor_id, 10675 service_context->region, 10676 caller_page_mask, 10677 new_prot, 10678 NULL); 10679 if (status != NV_OK) 10680 return status; 10681 } 10682 10683 return NV_OK; 10684 } 10685 10686 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id, 10687 uvm_va_block_t *va_block, 10688 uvm_va_block_retry_t *block_retry, 10689 uvm_service_block_context_t *service_context) 10690 { 10691 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10692 uvm_processor_id_t new_residency; 10693 NV_STATUS status = NV_OK; 10694 10695 uvm_assert_mutex_locked(&va_block->lock); 10696 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10697 service_context->block_context.hmm.vma, 10698 service_context->region)); 10699 10700 // GPU fault servicing must be done under the VA space read lock. GPU fault 10701 // servicing is required for RM to make forward progress, and we allow other 10702 // threads to call into RM while holding the VA space lock in read mode. If 10703 // we took the VA space lock in write mode on the GPU fault service path, 10704 // we could deadlock because the thread in RM which holds the VA space lock 10705 // for read wouldn't be able to complete until fault servicing completes. 10706 if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id)) 10707 uvm_assert_rwsem_locked(&va_space->lock); 10708 else 10709 uvm_assert_rwsem_locked_read(&va_space->lock); 10710 10711 uvm_va_block_get_prefetch_hint(va_block, 10712 uvm_va_policy_get_region(va_block, service_context->region), 10713 service_context); 10714 10715 for_each_id_in_mask(new_residency, &service_context->resident_processors) { 10716 if (uvm_va_block_is_hmm(va_block)) { 10717 status = uvm_hmm_va_block_service_locked(processor_id, new_residency, va_block, block_retry, service_context); 10718 if (status != NV_OK) 10719 break; 10720 10721 continue; 10722 } 10723 10724 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context); 10725 if (status != NV_OK) 10726 break; 10727 10728 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 10729 if (status != NV_OK) 10730 break; 10731 } 10732 10733 return status; 10734 } 10735 10736 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block, 10737 uvm_va_block_context_t *va_block_context, 10738 uvm_processor_id_t processor_id, 10739 uvm_page_index_t page_index, 10740 uvm_fault_type_t access_type, 10741 bool allow_migration) 10742 { 10743 uvm_va_range_t *va_range = va_block->va_range; 10744 uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type); 10745 10746 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10747 va_block_context->hmm.vma, 10748 uvm_va_block_region_for_page(page_index))); 10749 10750 // CPU permissions are checked later by block_map_cpu_page. 10751 // 10752 // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because 10753 // it can also be called from change_pte. 
Make change_pte call this 10754 // function and only check CPU permissions here. 10755 if (UVM_ID_IS_GPU(processor_id)) { 10756 if (va_range && uvm_va_range_is_managed_zombie(va_range)) 10757 return NV_ERR_INVALID_ADDRESS; 10758 10759 // GPU faults only check vma permissions if a mm is registered with the 10760 // VA space (ie. uvm_va_space_mm_retain_lock(va_space) != NULL) or if 10761 // uvm_enable_builtin_tests is set, because the Linux kernel can change 10762 // vm_flags at any moment (for example on mprotect) and here we are not 10763 // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that 10764 // this scenario does not happen. 10765 if (((va_block->hmm.va_space && va_block->hmm.va_space->va_space_mm.mm) || uvm_enable_builtin_tests) && 10766 (access_prot > compute_logical_prot(va_block, va_block_context->hmm.vma, page_index))) 10767 return NV_ERR_INVALID_ACCESS_TYPE; 10768 } 10769 10770 // Non-migratable range: 10771 // - CPU accesses are always fatal, regardless of the VA range residency 10772 // - GPU accesses are fatal if the GPU can't map the preferred location 10773 if (!allow_migration) { 10774 UVM_ASSERT(!uvm_va_block_is_hmm(va_block)); 10775 10776 if (UVM_ID_IS_CPU(processor_id)) { 10777 return NV_ERR_INVALID_OPERATION; 10778 } 10779 else { 10780 uvm_va_space_t *va_space = va_range->va_space; 10781 10782 return uvm_processor_mask_test( 10783 &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)], 10784 processor_id)? 10785 NV_OK : NV_ERR_INVALID_ACCESS_TYPE; 10786 } 10787 } 10788 10789 return NV_OK; 10790 } 10791 10792 // Check if we are faulting on a page with valid permissions to check if we can 10793 // skip fault handling. See uvm_va_block_t::cpu::fault_authorized for more 10794 // details 10795 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block, 10796 uvm_page_index_t page_index, 10797 uvm_fault_access_type_t fault_access_type) 10798 { 10799 // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for 10800 // HMM? 10801 if (uvm_va_block_is_hmm(va_block)) 10802 return false; 10803 10804 if (block_page_is_processor_authorized(va_block, 10805 page_index, 10806 UVM_ID_CPU, 10807 uvm_fault_access_type_to_prot(fault_access_type))) { 10808 NvU64 now = NV_GETTIME(); 10809 pid_t pid = current->pid; 10810 10811 // Latch the pid/timestamp/page_index values for the first time 10812 if (!va_block->cpu.fault_authorized.first_fault_stamp) { 10813 va_block->cpu.fault_authorized.first_fault_stamp = now; 10814 va_block->cpu.fault_authorized.first_pid = pid; 10815 va_block->cpu.fault_authorized.page_index = page_index; 10816 10817 return true; 10818 } 10819 10820 // If the same thread shows up again, this means that the kernel 10821 // downgraded the page's PTEs. Service the fault to force a remap of 10822 // the page. 10823 if (va_block->cpu.fault_authorized.first_pid == pid && 10824 va_block->cpu.fault_authorized.page_index == page_index) { 10825 va_block->cpu.fault_authorized.first_fault_stamp = 0; 10826 } 10827 else { 10828 // If the window has expired, clear the information and service the 10829 // fault. 
Otherwise, just return 10830 if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns) 10831 va_block->cpu.fault_authorized.first_fault_stamp = 0; 10832 else 10833 return true; 10834 } 10835 } 10836 10837 return false; 10838 } 10839 10840 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block, 10841 uvm_va_block_retry_t *va_block_retry, 10842 NvU64 fault_addr, 10843 uvm_fault_access_type_t fault_access_type, 10844 uvm_service_block_context_t *service_context) 10845 { 10846 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10847 NV_STATUS status = NV_OK; 10848 uvm_page_index_t page_index; 10849 uvm_perf_thrashing_hint_t thrashing_hint; 10850 uvm_processor_id_t new_residency; 10851 bool read_duplicate; 10852 const uvm_va_policy_t *policy; 10853 10854 uvm_assert_rwsem_locked(&va_space->lock); 10855 10856 UVM_ASSERT(fault_addr >= va_block->start); 10857 UVM_ASSERT(fault_addr <= va_block->end); 10858 10859 uvm_assert_mmap_lock_locked(service_context->block_context.mm); 10860 10861 policy = uvm_va_policy_get(va_block, fault_addr); 10862 10863 if (service_context->num_retries == 0) { 10864 // notify event to tools/performance heuristics 10865 uvm_perf_event_notify_cpu_fault(&va_space->perf_events, 10866 va_block, 10867 policy->preferred_location, 10868 fault_addr, 10869 fault_access_type > UVM_FAULT_ACCESS_TYPE_READ, 10870 KSTK_EIP(current)); 10871 } 10872 10873 // Check logical permissions 10874 page_index = uvm_va_block_cpu_page_index(va_block, fault_addr); 10875 status = uvm_va_block_check_logical_permissions(va_block, 10876 &service_context->block_context, 10877 UVM_ID_CPU, 10878 page_index, 10879 fault_access_type, 10880 uvm_range_group_address_migratable(va_space, fault_addr)); 10881 if (status != NV_OK) 10882 return status; 10883 10884 uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc); 10885 10886 if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type)) 10887 return NV_OK; 10888 10889 thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU); 10890 // Throttling is implemented by sleeping in the fault handler on the CPU 10891 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) { 10892 service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp; 10893 return NV_WARN_MORE_PROCESSING_REQUIRED; 10894 } 10895 10896 service_context->read_duplicate_count = 0; 10897 service_context->thrashing_pin_count = 0; 10898 service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS; 10899 10900 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 10901 uvm_page_mask_zero(&service_context->thrashing_pin_mask); 10902 uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index); 10903 service_context->thrashing_pin_count = 1; 10904 } 10905 10906 // Compute new residency and update the masks 10907 new_residency = uvm_va_block_select_residency(va_block, 10908 &service_context->block_context, 10909 page_index, 10910 UVM_ID_CPU, 10911 uvm_fault_access_type_mask_bit(fault_access_type), 10912 policy, 10913 &thrashing_hint, 10914 UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS, 10915 &read_duplicate); 10916 10917 // Initialize the minimum necessary state in the fault service context 10918 uvm_processor_mask_zero(&service_context->resident_processors); 10919 10920 // Set new residency and update the masks 10921 uvm_processor_mask_set(&service_context->resident_processors, new_residency); 10922 10923 // The masks 
need to be fully zeroed as the fault region may grow due to prefetching 10924 uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 10925 uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index); 10926 10927 if (read_duplicate) { 10928 uvm_page_mask_zero(&service_context->read_duplicate_mask); 10929 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); 10930 service_context->read_duplicate_count = 1; 10931 } 10932 10933 service_context->access_type[page_index] = fault_access_type; 10934 10935 service_context->region = uvm_va_block_region_for_page(page_index); 10936 10937 status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context); 10938 10939 ++service_context->num_retries; 10940 10941 return status; 10942 } 10943 10944 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block, 10945 NvU64 fault_addr, 10946 bool is_write, 10947 uvm_service_block_context_t *service_context) 10948 { 10949 NV_STATUS status; 10950 uvm_va_block_retry_t va_block_retry; 10951 uvm_fault_access_type_t fault_access_type; 10952 10953 if (is_write) 10954 fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG; 10955 else 10956 fault_access_type = UVM_FAULT_ACCESS_TYPE_READ; 10957 10958 service_context->num_retries = 0; 10959 service_context->cpu_fault.did_migrate = false; 10960 10961 // We have to use vm_insert_page instead of handing the page to the kernel 10962 // and letting it insert the mapping, and we must do that while holding the 10963 // lock on this VA block. Otherwise there will be a window in which we think 10964 // we've mapped the page but the CPU mapping hasn't actually been created 10965 // yet. During that window a GPU fault event could arrive and claim 10966 // ownership of that VA, "unmapping" it. Then later the kernel would 10967 // eventually establish the mapping, and we'd end up with both CPU and GPU 10968 // thinking they each owned the page. 10969 // 10970 // This function must only be called when it's safe to call vm_insert_page. 10971 // That is, there must be a reference held on the vma's vm_mm, and 10972 // vm_mm->mmap_lock is held in at least read mode. Note that current->mm 10973 // might not be vma->vm_mm. 
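    //
    // The UVM_VA_BLOCK_LOCK_RETRY() call below follows a lock-and-retry pattern:
    // the fault servicing expression runs with the block lock held and is
    // re-invoked when it reports that it needs another pass (for example after
    // memory had to be allocated). In very rough outline, and with purely
    // hypothetical names that are not this driver's API (assuming
    // <linux/mutex.h>), the pattern looks like:
    //
    //     enum my_status { MY_OK = 0, MY_AGAIN = 1 };
    //
    //     static enum my_status run_with_retry(struct mutex *lock,
    //                                          enum my_status (*op)(void *ctx),
    //                                          void *ctx)
    //     {
    //         enum my_status status;
    //
    //         do {
    //             mutex_lock(lock);
    //             status = op(ctx);      // may return MY_AGAIN to request another pass
    //             mutex_unlock(lock);
    //         } while (status == MY_AGAIN);
    //
    //         return status;
    //     }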
10974 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, 10975 &va_block_retry, 10976 block_cpu_fault_locked(va_block, 10977 &va_block_retry, 10978 fault_addr, 10979 fault_access_type, 10980 service_context)); 10981 return status; 10982 } 10983 10984 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block) 10985 { 10986 uvm_va_range_t *va_range; 10987 uvm_va_block_t *block; 10988 size_t index; 10989 10990 va_range = uvm_va_range_find(va_space, addr); 10991 if (!va_range) 10992 return uvm_hmm_va_block_find(va_space, addr, out_block); 10993 10994 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || 10995 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); 10996 10997 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) 10998 return NV_ERR_INVALID_ADDRESS; 10999 11000 index = uvm_va_range_block_index(va_range, addr); 11001 block = uvm_va_range_block(va_range, index); 11002 if (!block) 11003 return NV_ERR_OBJECT_NOT_FOUND; 11004 11005 *out_block = block; 11006 return NV_OK; 11007 } 11008 11009 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space, 11010 uvm_va_range_t *va_range, 11011 NvU64 addr, 11012 uvm_va_block_t **out_block) 11013 { 11014 size_t index; 11015 11016 if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0) 11017 return NV_ERR_NO_MEMORY; 11018 11019 UVM_ASSERT(va_range); 11020 UVM_ASSERT(addr >= va_range->node.start); 11021 UVM_ASSERT(addr <= va_range->node.end); 11022 11023 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || 11024 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); 11025 11026 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) 11027 return NV_ERR_INVALID_ADDRESS; 11028 11029 index = uvm_va_range_block_index(va_range, addr); 11030 return uvm_va_range_block_create(va_range, index, out_block); 11031 } 11032 11033 NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space, 11034 NvU64 addr, 11035 uvm_va_block_t **out_block) 11036 { 11037 uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr); 11038 11039 if (va_range) 11040 return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block); 11041 else 11042 return NV_ERR_INVALID_ADDRESS; 11043 } 11044 11045 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space, 11046 NvU64 addr, 11047 struct vm_area_struct **hmm_vma, 11048 uvm_va_block_t **out_block) 11049 { 11050 uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr); 11051 11052 if (hmm_vma) 11053 *hmm_vma = NULL; 11054 11055 if (va_range) 11056 return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block); 11057 else 11058 return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block); 11059 } 11060 11061 // Launch a synchronous, encrypted copy between GPU and CPU. 11062 // 11063 // The copy entails a GPU-side encryption (relying on the Copy Engine), and a 11064 // CPU-side decryption step, such that the destination CPU buffer pointed by 11065 // dst_plain will contain the unencrypted (plain text) contents. The destination 11066 // buffer can be in protected or unprotected sysmem, while the source buffer 11067 // must be in protected vidmem. 11068 // 11069 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE. 11070 // 11071 // The input tracker, if not NULL, is internally acquired by the push 11072 // responsible for the encrypted copy. 
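//
// For illustration only, a caller that wants the plain-text contents of a small
// protected vidmem region could invoke this helper roughly as follows. This is a
// sketch: gpu, src_gpu_address, src and va_block stand for whatever context the
// caller already has, and real callers in this file pass kernel uvm_mem_t
// allocations rather than a stack buffer.
//
//     char plain[64];
//     NV_STATUS status;
//
//     status = encrypted_memcopy_gpu_to_cpu(gpu,
//                                           plain,
//                                           src_gpu_address,
//                                           sizeof(plain),
//                                           &va_block->tracker,
//                                           "Encrypted read from [0x%llx, 0x%llx)",
//                                           src,
//                                           src + sizeof(plain));
//     if (status != NV_OK)
//         return status;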
11073 __attribute__ ((format(printf, 6, 7))) 11074 static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu, 11075 void *dst_plain, 11076 uvm_gpu_address_t src_gpu_address, 11077 size_t size, 11078 uvm_tracker_t *tracker, 11079 const char *format, 11080 ...) 11081 { 11082 NV_STATUS status; 11083 UvmCslIv decrypt_iv; 11084 uvm_push_t push; 11085 uvm_conf_computing_dma_buffer_t *dma_buffer; 11086 uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address; 11087 void *src_cipher, *auth_tag; 11088 va_list args; 11089 11090 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 11091 UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE); 11092 11093 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL); 11094 if (status != NV_OK) 11095 return status; 11096 11097 va_start(args, format); 11098 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args); 11099 va_end(args); 11100 11101 if (status != NV_OK) 11102 goto out; 11103 11104 uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv); 11105 11106 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 11107 auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 11108 gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address); 11109 11110 status = uvm_push_end_and_wait(&push); 11111 if (status != NV_OK) 11112 goto out; 11113 11114 src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc); 11115 auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag); 11116 status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag); 11117 11118 out: 11119 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL); 11120 return status; 11121 } 11122 11123 // Launch a synchronous, encrypted copy between CPU and GPU. 11124 // 11125 // The source CPU buffer pointed by src_plain contains the unencrypted (plain 11126 // text) contents; the function internally performs a CPU-side encryption step 11127 // before launching the GPU-side CE decryption. The source buffer can be in 11128 // protected or unprotected sysmem, while the destination buffer must be in 11129 // protected vidmem. 11130 // 11131 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE. 11132 // 11133 // The input tracker, if not NULL, is internally acquired by the push 11134 // responsible for the encrypted copy. 11135 __attribute__ ((format(printf, 6, 7))) 11136 static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu, 11137 uvm_gpu_address_t dst_gpu_address, 11138 void *src_plain, 11139 size_t size, 11140 uvm_tracker_t *tracker, 11141 const char *format, 11142 ...) 
11143 { 11144 NV_STATUS status; 11145 uvm_push_t push; 11146 uvm_conf_computing_dma_buffer_t *dma_buffer; 11147 uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address; 11148 void *dst_cipher, *auth_tag; 11149 va_list args; 11150 11151 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 11152 UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE); 11153 11154 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL); 11155 if (status != NV_OK) 11156 return status; 11157 11158 va_start(args, format); 11159 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args); 11160 va_end(args); 11161 11162 if (status != NV_OK) 11163 goto out; 11164 11165 dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc); 11166 auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag); 11167 uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag); 11168 11169 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 11170 auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 11171 gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address); 11172 11173 status = uvm_push_end_and_wait(&push); 11174 11175 out: 11176 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL); 11177 return status; 11178 } 11179 11180 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block, 11181 uvm_gpu_t *gpu, 11182 uvm_gpu_address_t dst_gpu_address, 11183 NvU64 dst, 11184 uvm_mem_t *src_mem, 11185 size_t size) 11186 { 11187 NV_STATUS status; 11188 uvm_push_t push; 11189 uvm_gpu_address_t src_gpu_address; 11190 11191 if (uvm_conf_computing_mode_enabled(gpu)) { 11192 return encrypted_memcopy_cpu_to_gpu(gpu, 11193 dst_gpu_address, 11194 uvm_mem_get_cpu_addr_kernel(src_mem), 11195 size, 11196 &va_block->tracker, 11197 "Encrypted write to [0x%llx, 0x%llx)", 11198 dst, 11199 dst + size); 11200 } 11201 11202 status = uvm_push_begin_acquire(gpu->channel_manager, 11203 UVM_CHANNEL_TYPE_CPU_TO_GPU, 11204 &va_block->tracker, 11205 &push, 11206 "Direct write to [0x%llx, 0x%llx)", 11207 dst, 11208 dst + size); 11209 if (status != NV_OK) 11210 return status; 11211 11212 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu); 11213 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); 11214 return uvm_push_end_and_wait(&push); 11215 } 11216 11217 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block, 11218 uvm_va_block_context_t *block_context, 11219 NvU64 dst, 11220 uvm_mem_t *src_mem, 11221 size_t size) 11222 { 11223 NV_STATUS status; 11224 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst); 11225 NvU64 page_offset = dst & (PAGE_SIZE - 1); 11226 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); 11227 uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index); 11228 11229 uvm_assert_mutex_locked(&va_block->lock); 11230 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size); 11231 11232 if (UVM_ID_IS_INVALID(proc)) 11233 proc = UVM_ID_CPU; 11234 11235 // Use make_resident() in all cases to break read-duplication, but 11236 // block_retry can be NULL as if the page is not resident yet we will make 11237 // it resident on the CPU. 
11238 // Notably we don't care about coherence with respect to atomics from other 11239 // processors. 11240 status = uvm_va_block_make_resident(va_block, 11241 NULL, 11242 block_context, 11243 proc, 11244 region, 11245 NULL, 11246 NULL, 11247 UVM_MAKE_RESIDENT_CAUSE_API_TOOLS); 11248 11249 if (status != NV_OK) 11250 return status; 11251 11252 if (UVM_ID_IS_CPU(proc)) { 11253 char *mapped_page; 11254 struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 11255 void *src = uvm_mem_get_cpu_addr_kernel(src_mem); 11256 11257 status = uvm_tracker_wait(&va_block->tracker); 11258 if (status != NV_OK) 11259 return status; 11260 11261 mapped_page = (char *)kmap(page); 11262 memcpy(mapped_page + page_offset, src, size); 11263 kunmap(page); 11264 11265 return NV_OK; 11266 } 11267 else { 11268 uvm_gpu_t *dst_gpu; 11269 uvm_gpu_address_t dst_gpu_address; 11270 11271 UVM_ASSERT(UVM_ID_IS_GPU(proc)); 11272 11273 dst_gpu = block_get_gpu(va_block, proc); 11274 11275 dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu); 11276 dst_gpu_address.address += page_offset; 11277 11278 return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size); 11279 } 11280 } 11281 11282 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block, 11283 uvm_mem_t *dst_mem, 11284 uvm_gpu_t *gpu, 11285 uvm_gpu_address_t src_gpu_address, 11286 NvU64 src, 11287 size_t size) 11288 { 11289 NV_STATUS status; 11290 uvm_push_t push; 11291 uvm_gpu_address_t dst_gpu_address; 11292 11293 if (uvm_conf_computing_mode_enabled(gpu)) { 11294 return encrypted_memcopy_gpu_to_cpu(gpu, 11295 uvm_mem_get_cpu_addr_kernel(dst_mem), 11296 src_gpu_address, 11297 size, 11298 &va_block->tracker, 11299 "Encrypted read from [0x%llx, 0x%llx)", 11300 src, 11301 src + size); 11302 } 11303 11304 status = uvm_push_begin_acquire(gpu->channel_manager, 11305 UVM_CHANNEL_TYPE_GPU_TO_CPU, 11306 &va_block->tracker, 11307 &push, 11308 "Direct read from [0x%llx, 0x%llx)", 11309 src, 11310 src + size); 11311 if (status != NV_OK) 11312 return status; 11313 11314 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu); 11315 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); 11316 return uvm_push_end_and_wait(&push); 11317 } 11318 11319 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size) 11320 { 11321 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src); 11322 NvU64 page_offset = src & (PAGE_SIZE - 1); 11323 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); 11324 void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem); 11325 11326 uvm_assert_mutex_locked(&va_block->lock); 11327 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size); 11328 11329 if (UVM_ID_IS_INVALID(proc)) { 11330 memset(dst, 0, size); 11331 return NV_OK; 11332 } 11333 else if (UVM_ID_IS_CPU(proc)) { 11334 NV_STATUS status; 11335 char *mapped_page; 11336 struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 11337 11338 status = uvm_tracker_wait(&va_block->tracker); 11339 if (status != NV_OK) 11340 return status; 11341 11342 mapped_page = (char *)kmap(page); 11343 memcpy(dst, mapped_page + page_offset, size); 11344 kunmap(page); 11345 11346 return NV_OK; 11347 } 11348 else { 11349 uvm_gpu_address_t src_gpu_address; 11350 uvm_gpu_t *gpu = block_get_gpu(va_block, proc); 11351 11352 
src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu); 11353 src_gpu_address.address += page_offset; 11354 11355 return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size); 11356 } 11357 } 11358 11359 // Deferred work item reestablishing accessed by mappings after eviction. On 11360 // GPUs with access counters enabled, the evicted GPU will also get remote 11361 // mappings. 11362 static void block_add_eviction_mappings(void *args) 11363 { 11364 uvm_va_block_t *va_block = (uvm_va_block_t*)args; 11365 uvm_va_space_t *va_space; 11366 uvm_processor_id_t id; 11367 uvm_va_block_context_t *block_context = NULL; 11368 struct mm_struct *mm = NULL; 11369 11370 uvm_mutex_lock(&va_block->lock); 11371 va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 11372 uvm_mutex_unlock(&va_block->lock); 11373 11374 if (!va_space) { 11375 // Block has been killed in the meantime 11376 goto done; 11377 } 11378 11379 mm = uvm_va_space_mm_retain_lock(va_space); 11380 11381 block_context = uvm_va_block_context_alloc(mm); 11382 if (!block_context) 11383 goto done; 11384 11385 // The block wasn't dead when we checked above and that's enough to 11386 // guarantee that the VA space is still around, because 11387 // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that 11388 // flush waits for this function call to finish. 11389 uvm_va_space_down_read(va_space); 11390 11391 // Now that we have the VA space lock held, we can check whether the block 11392 // is still alive since the VA space write lock is needed to kill blocks. 11393 if (uvm_va_block_is_dead(va_block)) 11394 goto unlock; 11395 11396 if (uvm_va_block_is_hmm(va_block)) { 11397 uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context); 11398 } 11399 else { 11400 uvm_va_range_t *va_range = va_block->va_range; 11401 NV_STATUS status = NV_OK; 11402 11403 for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) { 11404 status = uvm_va_block_set_accessed_by(va_block, block_context, id); 11405 if (status != NV_OK) 11406 break; 11407 } 11408 11409 if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) { 11410 uvm_processor_mask_t map_processors; 11411 11412 // Exclude the processors that have been already mapped due to 11413 // AccessedBy 11414 uvm_processor_mask_andnot(&map_processors, 11415 &va_block->evicted_gpus, 11416 &uvm_va_range_get_policy(va_range)->accessed_by); 11417 11418 for_each_gpu_id_in_mask(id, &map_processors) { 11419 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); 11420 uvm_va_block_gpu_state_t *gpu_state; 11421 11422 if (!gpu->parent->access_counters_supported) 11423 continue; 11424 11425 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 11426 UVM_ASSERT(gpu_state); 11427 11428 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add 11429 // remote mappings to read-duplicated pages. Add support for it 11430 // or create a new function. 
11431 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL, 11432 uvm_va_block_add_mappings(va_block, 11433 block_context, 11434 id, 11435 uvm_va_block_region_from_block(va_block), 11436 &gpu_state->evicted, 11437 UvmEventMapRemoteCauseEviction)); 11438 if (status != NV_OK) 11439 break; 11440 } 11441 } 11442 11443 if (status != NV_OK) { 11444 UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n", 11445 va_block->start, 11446 va_block->end, 11447 nvstatusToString(status), 11448 uvm_va_space_processor_name(va_space, id)); 11449 } 11450 } 11451 11452 unlock: 11453 uvm_va_space_up_read(va_space); 11454 uvm_va_block_context_free(block_context); 11455 11456 done: 11457 uvm_va_space_mm_release_unlock(va_space, mm); 11458 uvm_va_block_release(va_block); 11459 } 11460 11461 static void block_add_eviction_mappings_entry(void *args) 11462 { 11463 UVM_ENTRY_VOID(block_add_eviction_mappings(args)); 11464 } 11465 11466 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, 11467 uvm_gpu_t *gpu, 11468 uvm_gpu_chunk_t *root_chunk, 11469 uvm_tracker_t *tracker) 11470 { 11471 NV_STATUS status = NV_OK; 11472 NvU32 i; 11473 uvm_va_block_gpu_state_t *gpu_state; 11474 uvm_va_block_region_t chunk_region; 11475 size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu); 11476 size_t chunks_to_evict = 0; 11477 uvm_va_block_context_t *block_context; 11478 uvm_page_mask_t *pages_to_evict; 11479 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); 11480 uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 11481 struct mm_struct *mm; 11482 bool accessed_by_set = false; 11483 11484 uvm_assert_mutex_locked(&va_block->lock); 11485 11486 // The block might have been killed in the meantime 11487 if (!va_space) 11488 return NV_OK; 11489 11490 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 11491 if (!gpu_state) 11492 return NV_OK; 11493 11494 if (va_block_test && va_block_test->inject_eviction_error) { 11495 va_block_test->inject_eviction_error = false; 11496 return NV_ERR_NO_MEMORY; 11497 } 11498 11499 // We cannot take this block's VA space or mmap_lock locks on the eviction 11500 // path, however, we retain mm in order to support accounting of CPU memory 11501 // allocations. If mappings need to be created, 11502 // block_add_eviction_mappings() will be scheduled below. 
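    //
    // That deferred scheduling uses the usual reference-counting discipline for
    // asynchronous work: take a reference on the object before queuing the work
    // item, let the handler drop it when it finishes, and drop it immediately if
    // queuing fails. A self-contained sketch of the idea, with hypothetical names
    // that are not this driver's API (assuming <linux/atomic.h>):
    //
    //     struct obj {
    //         atomic_t refcount;
    //     };
    //
    //     static bool schedule_deferred(struct obj *o, bool (*queue_work)(struct obj *))
    //     {
    //         atomic_inc(&o->refcount);          // reference owned by the work item
    //
    //         if (!queue_work(o)) {
    //             atomic_dec(&o->refcount);      // queuing failed: give the reference back
    //             return false;
    //         }
    //
    //         return true;                       // the handler releases the reference when done
    //     }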
11503 mm = uvm_va_space_mm_retain(va_space); 11504 block_context = uvm_va_block_context_alloc(mm); 11505 if (!block_context) { 11506 if (mm) 11507 uvm_va_space_mm_release(va_space); 11508 return NV_ERR_NO_MEMORY; 11509 } 11510 11511 pages_to_evict = &block_context->caller_page_mask; 11512 uvm_page_mask_zero(pages_to_evict); 11513 chunk_region.outer = 0; 11514 11515 // Find all chunks that are subchunks of the root chunk 11516 for (i = 0; i < num_gpu_chunks; ++i) { 11517 uvm_chunk_size_t chunk_size; 11518 size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size); 11519 UVM_ASSERT(chunk_index == i); 11520 chunk_region.first = chunk_region.outer; 11521 chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE; 11522 11523 if (!gpu_state->chunks[i]) 11524 continue; 11525 if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk)) 11526 continue; 11527 11528 if (uvm_va_block_is_hmm(va_block)) { 11529 status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region); 11530 if (status != NV_OK) 11531 break; 11532 } 11533 11534 uvm_page_mask_region_fill(pages_to_evict, chunk_region); 11535 ++chunks_to_evict; 11536 } 11537 11538 if (chunks_to_evict == 0) 11539 goto out; 11540 11541 // Only move pages resident on the GPU 11542 uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id)); 11543 uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors); 11544 11545 if (uvm_va_block_is_hmm(va_block)) { 11546 status = uvm_hmm_va_block_evict_chunks(va_block, 11547 block_context, 11548 pages_to_evict, 11549 uvm_va_block_region_from_block(va_block), 11550 &accessed_by_set); 11551 } 11552 else { 11553 const uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range); 11554 accessed_by_set = uvm_processor_mask_get_count(&policy->accessed_by) > 0; 11555 11556 // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's 11557 // not necessary to do so for eviction. Add a version that unmaps only 11558 // the processors that have mappings to the pages being evicted. 11559 status = uvm_va_block_make_resident(va_block, 11560 NULL, 11561 block_context, 11562 UVM_ID_CPU, 11563 uvm_va_block_region_from_block(va_block), 11564 pages_to_evict, 11565 NULL, 11566 UVM_MAKE_RESIDENT_CAUSE_EVICTION); 11567 } 11568 if (status != NV_OK) 11569 goto out; 11570 11571 // VA space lock may not be held and hence we cannot reestablish any 11572 // mappings here and need to defer it to a work queue. 11573 // 11574 // Reading the accessed_by mask without the VA space lock is safe because 11575 // adding a new processor to the mask triggers going over all the VA blocks 11576 // in the range and locking them. And we hold one of the VA block's locks. 11577 // 11578 // If uvm_va_range_set_accessed_by() hasn't called 11579 // uvm_va_block_set_accessed_by() for this block yet then it will take care 11580 // of adding the mapping after we are done. If it already did then we are 11581 // guaranteed to see the new processor in the accessed_by mask because we 11582 // locked the block's lock that the thread calling 11583 // uvm_va_range_set_accessed_by() unlocked after updating the mask. 11584 // 11585 // If a processor gets removed from the mask then we might not notice and 11586 // schedule the work item anyway, but that's benign as 11587 // block_add_eviction_mappings() re-examines the mask. 
11588 // 11589 // Checking if access counters migrations are enabled on a VA space is racy 11590 // without holding the VA space lock. However, this is fine as 11591 // block_add_eviction_mappings() reexamines the value with the VA space 11592 // lock being held. 11593 if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) { 11594 // Always retain the VA block first so that it's safe for the deferred 11595 // callback to release it immediately after it runs. 11596 uvm_va_block_retain(va_block); 11597 11598 if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q, 11599 &va_block->eviction_mappings_q_item)) { 11600 // And release it if no new callback was scheduled 11601 uvm_va_block_release_no_destroy(va_block); 11602 } 11603 } 11604 11605 status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker); 11606 if (status != NV_OK) 11607 goto out; 11608 11609 for (i = 0; i < num_gpu_chunks; ++i) { 11610 uvm_gpu_id_t accessing_gpu_id; 11611 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 11612 11613 if (!chunk) 11614 continue; 11615 if (!uvm_gpu_chunk_same_root(chunk, root_chunk)) 11616 continue; 11617 11618 // Remove the mappings of indirect peers from the reverse map. We 11619 // access the indirect peer mask from the VA space without holding the 11620 // VA space lock. Therefore, we can race with enable_peer/disable_peer 11621 // operations. However this is fine: 11622 // 11623 // The enable_peer sequence is as follows: 11624 // 11625 // set_bit in va_space->indirect_peers 11626 // uvm_va_block_enable_peer; 11627 // 11628 // - If we read the mask BEFORE it is set or AFTER the mapping has 11629 // been added to the map there is no race. 11630 // - If we read the mask AFTER it is set but BEFORE adding the mapping 11631 // to the reverse map, we will try to remove it although it is not 11632 // there yet. Therefore, we use 11633 // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does 11634 // not check if the mapping is present in the reverse map. 11635 // 11636 // The disable_peer sequence is as follows: 11637 // 11638 // uvm_va_block_disable_peer; 11639 // clear_bit in va_space->indirect_peers 11640 // 11641 // - If we read the mask BEFORE the mapping has been added to the map 11642 // or AFTER the bit has been cleared, there is no race. 11643 // - If we read the mask AFTER the mapping has been removed and BEFORE 11644 // the bit is cleared, we will try to remove the mapping, too. 11645 // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works 11646 // in this scenario. 11647 // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect 11648 // peers are not supported when SMC is enabled. 
11649 for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 11650 uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id); 11651 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 11652 11653 uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings, 11654 peer_addr); 11655 } 11656 11657 uvm_mmu_chunk_unmap(chunk, tracker); 11658 11659 uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]); 11660 gpu_state->chunks[i] = NULL; 11661 } 11662 11663 out: 11664 uvm_va_block_context_free(block_context); 11665 if (mm) 11666 uvm_va_space_mm_release(va_space); 11667 11668 return status; 11669 } 11670 11671 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) 11672 { 11673 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); 11674 uvm_push_t push; 11675 NV_STATUS status; 11676 11677 // See comment in uvm_va_block_set_cancel 11678 UVM_ASSERT(!gpu->parent->fault_cancel_va_supported); 11679 11680 if (!gpu_state) 11681 return NV_ERR_NO_MEMORY; 11682 11683 // Force all pages to be 4K and prevent future upgrades during cancel 11684 gpu_state->force_4k_ptes = true; 11685 11686 // If we have no page tables we're done. For fault cancel we need to make 11687 // sure that fatal faults are on different 4k PTEs than non-fatal faults, 11688 // and we need to service all non-fatal faults before issuing the cancel. So 11689 // either all faults are fatal and we have no PTEs (we're PROT_NONE), or 11690 // we'll allocate PTEs later when we service the non-fatal faults. Those 11691 // PTEs will be 4k since force_4k_ptes is set. 11692 if (!block_gpu_has_page_tables(block, gpu)) 11693 return NV_OK; 11694 11695 // Are we 4k already? 11696 if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 11697 return NV_OK; 11698 11699 status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL); 11700 if (status != NV_OK) 11701 return status; 11702 11703 status = uvm_push_begin_acquire(gpu->channel_manager, 11704 UVM_CHANNEL_TYPE_MEMOPS, 11705 &block->tracker, 11706 &push, 11707 "Forcing 4k PTEs on block [0x%llx, 0x%llx)", 11708 block->start, 11709 block->end + 1); 11710 if (status != NV_OK) 11711 return status; 11712 11713 if (gpu_state->pte_is_2m) 11714 block_gpu_split_2m(block, block_context, gpu, NULL, &push); 11715 else 11716 block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push); 11717 11718 uvm_push_end(&push); 11719 11720 UVM_ASSERT(block_check_mappings(block)); 11721 11722 return uvm_tracker_add_push_safe(&block->tracker, &push); 11723 } 11724 11725 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) 11726 { 11727 uvm_assert_mutex_locked(&va_block->lock); 11728 11729 // Volta+ devices support a global VA cancel method that does not require 11730 // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it 11731 // could result in 4k PTEs on P9 systems which otherwise would never need 11732 // them. 
11733 if (gpu->parent->fault_cancel_va_supported) 11734 return NV_OK; 11735 11736 return block_gpu_force_4k_ptes(va_block, block_context, gpu); 11737 } 11738 11739 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp) 11740 { 11741 uvm_va_space_t *va_space = uvm_va_space_get(filp); 11742 struct mm_struct *mm; 11743 uvm_va_block_t *va_block; 11744 uvm_va_block_test_t *va_block_test; 11745 NV_STATUS status = NV_OK; 11746 11747 mm = uvm_va_space_mm_or_current_retain_lock(va_space); 11748 uvm_va_space_down_read(va_space); 11749 11750 if (mm) 11751 status = uvm_va_block_find_create(va_space, params->lookup_address, NULL, &va_block); 11752 else 11753 status = uvm_va_block_find_create_managed(va_space, params->lookup_address, &va_block); 11754 11755 if (status != NV_OK) 11756 goto out; 11757 11758 va_block_test = uvm_va_block_get_test(va_block); 11759 UVM_ASSERT(va_block_test); 11760 11761 uvm_mutex_lock(&va_block->lock); 11762 11763 if (params->page_table_allocation_retry_force_count) 11764 va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count; 11765 11766 if (params->user_pages_allocation_retry_force_count) 11767 va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count; 11768 11769 if (params->cpu_chunk_allocation_size_mask) { 11770 if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES || 11771 !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) { 11772 status = NV_ERR_INVALID_ARGUMENT; 11773 goto block_unlock; 11774 } 11775 11776 va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES; 11777 } 11778 11779 if (params->eviction_error) 11780 va_block_test->inject_eviction_error = params->eviction_error; 11781 11782 if (params->cpu_pages_allocation_error_count) 11783 va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count; 11784 11785 if (params->populate_error) 11786 va_block_test->inject_populate_error = params->populate_error; 11787 11788 block_unlock: 11789 uvm_mutex_unlock(&va_block->lock); 11790 11791 out: 11792 uvm_va_space_up_read(va_space); 11793 uvm_va_space_mm_or_current_release_unlock(va_space, mm); 11794 return status; 11795 } 11796 11797 static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] = 11798 { 11799 [UVM_TEST_PTE_MAPPING_INVALID] = UVM_PROT_NONE, 11800 [UVM_TEST_PTE_MAPPING_READ_ONLY] = UVM_PROT_READ_ONLY, 11801 [UVM_TEST_PTE_MAPPING_READ_WRITE] = UVM_PROT_READ_WRITE, 11802 [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC, 11803 }; 11804 11805 static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] = 11806 { 11807 [UVM_PROT_NONE] = UVM_TEST_PTE_MAPPING_INVALID, 11808 [UVM_PROT_READ_ONLY] = UVM_TEST_PTE_MAPPING_READ_ONLY, 11809 [UVM_PROT_READ_WRITE] = UVM_TEST_PTE_MAPPING_READ_WRITE, 11810 [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC, 11811 }; 11812 11813 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp) 11814 { 11815 uvm_va_space_t *va_space = uvm_va_space_get(filp); 11816 uvm_va_block_t *block; 11817 struct mm_struct *mm; 11818 NV_STATUS status = NV_OK; 11819 uvm_prot_t curr_prot, new_prot; 11820 uvm_gpu_t *gpu = NULL; 11821 uvm_processor_id_t id; 11822 uvm_tracker_t local_tracker; 11823 uvm_va_block_region_t region; 11824 uvm_va_block_context_t *block_context = NULL; 11825 11826 if 
(!PAGE_ALIGNED(params->va))
11827 return NV_ERR_INVALID_ADDRESS;
11828
11829 if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
11830 return NV_ERR_INVALID_ARGUMENT;
11831
11832 new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];
11833
11834 // mmap_lock isn't needed for invalidating CPU mappings, but it will be
11835 // needed for inserting them.
11836 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11837 uvm_va_space_down_read(va_space);
11838
11839 if (uvm_uuid_is_cpu(&params->uuid)) {
11840 id = UVM_ID_CPU;
11841 }
11842 else {
11843 gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
11844 if (!gpu) {
11845 status = NV_ERR_INVALID_DEVICE;
11846 goto out;
11847 }
11848
11849 // Check if the GPU can access the VA
11850 if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
11851 status = NV_ERR_OUT_OF_RANGE;
11852 goto out;
11853 }
11854
11855 id = gpu->id;
11856 }
11857
11858 block_context = uvm_va_block_context_alloc(mm);
11859 if (!block_context) {
11860 status = NV_ERR_NO_MEMORY;
11861 goto out;
11862 }
11863
11864 if (mm)
11865 status = uvm_va_block_find_create(va_space, params->va, &block_context->hmm.vma, &block);
11866 else
11867 status = uvm_va_block_find_create_managed(va_space, params->va, &block);
11868
11869 if (status != NV_OK)
11870 goto out;
11871
11872 // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
11873 if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
11874 goto out;
11875
11876 uvm_mutex_lock(&block->lock);
11877
11878 region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
11879 curr_prot = block_page_prot(block, id, region.first);
11880
11881 if (new_prot == curr_prot) {
11882 status = NV_OK;
11883 goto out_block;
11884 }
11885
11886 // TODO: Bug 1766124: Upgrades might require revoking other processors'
11887 // access privileges. We just fail for now. Only downgrades are
11888 // supported. If we allowed upgrades, we would need to check the mm
11889 // like we do for revocation below.
11890 if (new_prot > curr_prot) {
11891 status = NV_ERR_INVALID_OPERATION;
11892 goto out_block;
11893 }
11894
11895 if (new_prot == UVM_PROT_NONE) {
11896 status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
11897 }
11898 else {
11899 UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));
11900
11901 // Revoking CPU mappings performs a combination of unmap + map. The map
11902 // portion requires a valid mm.
11903 if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
11904 status = NV_ERR_INVALID_STATE;
11905 }
11906 else {
11907 status = uvm_va_block_revoke_prot(block,
11908 block_context,
11909 id,
11910 region,
11911 NULL,
11912 new_prot + 1,
11913 &block->tracker);
11914 }
11915 }
11916
11917 out_block:
11918 if (status == NV_OK)
11919 status = uvm_tracker_init_from(&local_tracker, &block->tracker);
11920
11921 uvm_mutex_unlock(&block->lock);
11922
11923 if (status == NV_OK)
11924 status = uvm_tracker_wait_deinit(&local_tracker);
11925
11926 out:
11927 uvm_va_space_up_read(va_space);
11928 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11929
11930 uvm_va_block_context_free(block_context);
11931
11932 return status;
11933 }
11934
11935 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
11936 {
11937 uvm_va_space_t *va_space = uvm_va_space_get(filp);
11938 uvm_va_block_t *va_block;
11939 uvm_va_range_t *va_range;
11940 struct mm_struct *mm;
11941 size_t index;
11942 NV_STATUS status = NV_OK;
11943
11944 BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);
11945
11946 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11947 uvm_va_space_down_read(va_space);
11948
11949 va_range = uvm_va_range_find(va_space, params->lookup_address);
11950 if (!va_range) {
11951 status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
11952 if (status == NV_ERR_OBJECT_NOT_FOUND) {
11953 status = uvm_hmm_va_block_range_bounds(va_space,
11954 mm,
11955 params->lookup_address,
11956 &params->va_block_start,
11957 &params->va_block_end,
11958 NULL);
11959 goto out;
11960 }
11961 else if (status != NV_OK) {
11962 goto out;
11963 }
11964 }
11965 else {
11966 index = uvm_va_range_block_index(va_range, params->lookup_address);
11967 va_block = uvm_va_range_block(va_range, index);
11968 if (!va_block) {
11969 status = NV_ERR_OBJECT_NOT_FOUND;
11970 goto out;
11971 }
11972 }
11973
11974 params->va_block_start = va_block->start;
11975 params->va_block_end = va_block->end;
11976
11977 out:
11978 uvm_va_space_up_read(va_space);
11979 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11980 return status;
11981 }
11982
11983 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
11984 {
11985 NV_STATUS status = NV_OK;
11986 uvm_va_space_t *va_space = uvm_va_space_get(filp);
11987 uvm_va_range_t *va_range;
11988 uvm_va_block_t *block = NULL;
11989 struct mm_struct *mm;
11990 NvU32 count = 0;
11991 uvm_processor_mask_t resident_on_mask;
11992 uvm_processor_id_t id;
11993 uvm_page_index_t page_index;
11994 unsigned release_block_count = 0;
11995 NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
11996 size_t index;
11997
11998 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11999 uvm_va_space_down_read(va_space);
12000
12001 // Inline uvm_va_block_find() to get the va_range.
12002 va_range = uvm_va_range_find(va_space, addr);
12003 if (!va_range) {
12004 NvU64 start, end;
12005
12006 status = uvm_hmm_va_block_find(va_space, addr, &block);
12007 if (status != NV_OK) {
12008 if (status != NV_ERR_OBJECT_NOT_FOUND)
12009 goto out;
12010 status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
12011 goto out;
12012 }
12013 // Update current CPU mapping information.
12014 status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
12015 if (status != NV_OK) {
12016 block = NULL;
12017 goto out;
12018 }
12019 }
12020 else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
12021 status = NV_ERR_INVALID_ADDRESS;
12022 goto out;
12023 }
12024 else {
12025 index = uvm_va_range_block_index(va_range, addr);
12026 block = uvm_va_range_block(va_range, index);
12027 if (!block) {
12028 params->resident_on_count = 0;
12029 params->populated_on_count = 0;
12030 params->mapped_on_count = 0;
12031
12032 status = NV_OK;
12033
12034 goto out;
12035 }
12036 }
12037
12038 uvm_mutex_lock(&block->lock);
12039
12040 page_index = uvm_va_block_cpu_page_index(block, addr);
12041 uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);
12042
12043 for_each_id_in_mask(id, &resident_on_mask) {
12044 block_phys_page_t block_page = block_phys_page(id, page_index);
12045 uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
12046 params->resident_physical_size[count] = block_phys_page_size(block, block_page);
12047 if (UVM_ID_IS_CPU(id)) {
12048 params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
12049 }
12050 else {
12051 params->resident_physical_address[count] =
12052 block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
12053 }
12054 ++count;
12055 }
12056 params->resident_on_count = count;
12057
12058 count = 0;
12059 for_each_id_in_mask(id, &block->mapped) {
12060 uvm_processor_id_t processor_to_map;
12061 block_phys_page_t block_page;
12062 NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12063
12064 if (page_size == 0)
12065 continue;
12066
12067 uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);
12068
12069 params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
12070 UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
12071 processor_to_map = block_get_processor_to_map(block, id, page_index);
12072 block_page = block_phys_page(processor_to_map, page_index);
12073
12074 if (!UVM_ID_IS_CPU(id)) {
12075 uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
12076 block_page,
12077 uvm_va_space_get_gpu(va_space, id));
12078 params->mapping_physical_address[count] = gpu_phys_addr.address;
12079 }
12080 else {
12081 struct page *page = block_page_get(block, block_page);
12082
12083 params->mapping_physical_address[count] = page_to_phys(page);
12084 }
12085
12086 params->page_size[count] = page_size;
12087 ++count;
12088 }
12089
12090 if (params->resident_on_count == 1) {
12091 if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) {
12092 if (uvm_pmm_sysmem_mappings_indirect_supported()) {
12093 for_each_gpu_id(id) {
12094 NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12095 uvm_reverse_map_t sysmem_page;
12096 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
12097 size_t num_pages;
12098 uvm_gpu_t *gpu;
12099
12100 if (!uvm_va_block_gpu_state_get(block, id))
12101 continue;
12102
12103 gpu = uvm_va_space_get_gpu(va_space, id);
12104
12105 if (!gpu->parent->access_counters_supported)
12106 continue;
12107
12108 num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
12109 uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
12110 uvm_cpu_chunk_get_size(chunk),
12111 &sysmem_page,
12112 1);
12113 if (page_size > 0)
12114 UVM_ASSERT(num_pages == 1);
12115 else
12116 UVM_ASSERT(num_pages <= 1);
12117
12118 if (num_pages == 1) {
12119 UVM_ASSERT(sysmem_page.va_block == block);
12120 UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
12121 UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);
12122
12123 ++release_block_count;
12124 }
12125 }
12126 }
12127 }
12128 else {
12129 uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask);
12130 uvm_reverse_map_t gpu_mapping;
12131 size_t num_pages;
12132 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
12133 uvm_gpu_phys_address_t phys_addr;
12134
12135 phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
12136 num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
12137
12138 // Chunk may be in TEMP_PINNED state so it may not have a VA block
12139 // assigned. In that case, we don't get a valid translation.
12140 if (num_pages > 0) {
12141 UVM_ASSERT(num_pages == 1);
12142 UVM_ASSERT(gpu_mapping.va_block == block);
12143 UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
12144
12145 ++release_block_count;
12146 }
12147 }
12148 }
12149
12150 params->mapped_on_count = count;
12151
12152 count = 0;
12153 for_each_processor_id(id) {
12154 if (!block_processor_page_is_populated(block, id, page_index))
12155 continue;
12156
12157 uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
12158 ++count;
12159 }
12160 params->populated_on_count = count;
12161
12162 out:
12163 if (block) {
12164 if (!params->is_async && status == NV_OK)
12165 status = uvm_tracker_wait(&block->tracker);
12166 uvm_mutex_unlock(&block->lock);
12167 while (release_block_count--)
12168 uvm_va_block_release(block);
12169 }
12170 uvm_va_space_up_read(va_space);
12171 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
12172 return status;
12173 }
12174
12175 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
12176 {
12177 block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
12178 }
12179
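
// The two lookup tables defined earlier in this file
// (g_uvm_test_pte_mapping_to_prot and g_uvm_prot_to_test_pte_mapping) are
// intended to be inverses of each other: uvm_test_change_pte_mapping()
// converts the ioctl value to a uvm_prot_t on the way in, and
// uvm_test_va_residency_info() converts a uvm_prot_t back to an ioctl value
// when reporting mapping_type. The guarded sketch below is a hypothetical
// self-check of that round trip, not driver code: the helper name is made up
// for illustration, it is kept out of the build with #if 0, and it assumes
// the UVM_TEST_PTE_MAPPING values are contiguous starting at 0.
#if 0
static void uvm_test_pte_mapping_tables_check(void)
{
    UVM_TEST_PTE_MAPPING mapping;

    for (mapping = 0; mapping < UVM_TEST_PTE_MAPPING_MAX; ++mapping) {
        uvm_prot_t prot = g_uvm_test_pte_mapping_to_prot[mapping];

        // Converting a test mapping value to a protection and back should
        // yield the original value, e.g. UVM_TEST_PTE_MAPPING_READ_ONLY ->
        // UVM_PROT_READ_ONLY -> UVM_TEST_PTE_MAPPING_READ_ONLY.
        UVM_ASSERT(g_uvm_prot_to_test_pte_mapping[prot] == mapping);
    }
}
#endif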