1 /******************************************************************************* 2 Copyright (c) 2015-2023 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "uvm_linux.h" 25 #include "uvm_common.h" 26 #include "uvm_api.h" 27 #include "uvm_gpu.h" 28 #include "uvm_va_space.h" 29 #include "uvm_va_range.h" 30 #include "uvm_va_block.h" 31 #include "uvm_hal_types.h" 32 #include "uvm_kvmalloc.h" 33 #include "uvm_tools.h" 34 #include "uvm_push.h" 35 #include "uvm_hal.h" 36 #include "uvm_perf_thrashing.h" 37 #include "uvm_perf_prefetch.h" 38 #include "uvm_mem.h" 39 #include "uvm_gpu_access_counters.h" 40 #include "uvm_va_space_mm.h" 41 #include "uvm_test_ioctl.h" 42 #include "uvm_conf_computing.h" 43 44 typedef enum 45 { 46 BLOCK_PTE_OP_MAP, 47 BLOCK_PTE_OP_REVOKE, 48 BLOCK_PTE_OP_COUNT 49 } block_pte_op_t; 50 51 static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000; 52 53 static struct kmem_cache *g_uvm_va_block_cache __read_mostly; 54 static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly; 55 static struct kmem_cache *g_uvm_page_mask_cache __read_mostly; 56 static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly; 57 58 static int uvm_fault_force_sysmem __read_mostly = 0; 59 module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR); 60 MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0."); 61 62 static int uvm_perf_map_remote_on_eviction __read_mostly = 1; 63 module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO); 64 65 // Caching is always disabled for mappings to remote memory. The following two 66 // module parameters can be used to force caching for GPU peer/sysmem mappings. 67 // 68 // However, it is important to note that it may not be safe to enable caching 69 // in the general case so the enablement should only be used for experiments. 70 static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0; 71 module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO); 72 MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem, 73 "Force caching for mappings to peer memory. " 74 "This is an experimental parameter that may cause correctness issues if used."); 75 76 static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0; 77 module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO); 78 MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem, 79 "Force caching for mappings to system memory. 
" 80 "This is an experimental parameter that may cause correctness issues if used."); 81 82 static void block_add_eviction_mappings_entry(void *args); 83 84 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block) 85 { 86 #if UVM_IS_CONFIG_HMM() 87 if (va_block->hmm.va_space) 88 return va_block->hmm.va_space; 89 #endif 90 91 if (va_block->va_range) 92 return va_block->va_range->va_space; 93 94 return NULL; 95 } 96 97 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block) 98 { 99 uvm_va_space_t *va_space; 100 101 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 102 103 va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 104 UVM_ASSERT(va_space); 105 106 return va_space; 107 } 108 109 bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block, 110 const uvm_va_policy_t *policy, 111 uvm_va_block_region_t region) 112 { 113 uvm_assert_mutex_locked(&va_block->lock); 114 115 if (uvm_va_block_is_hmm(va_block)) { 116 const uvm_va_policy_node_t *node; 117 118 if (uvm_va_policy_is_default(policy)) { 119 // There should only be the default policy within the region. 120 node = uvm_va_policy_node_iter_first(va_block, 121 uvm_va_block_region_start(va_block, region), 122 uvm_va_block_region_end(va_block, region)); 123 UVM_ASSERT(!node); 124 } 125 else { 126 // The policy node should cover the region. 127 node = uvm_va_policy_node_from_policy(policy); 128 UVM_ASSERT(node->node.start <= uvm_va_block_region_start(va_block, region)); 129 UVM_ASSERT(node->node.end >= uvm_va_block_region_end(va_block, region)); 130 } 131 } 132 else { 133 UVM_ASSERT(policy == uvm_va_range_get_policy(va_block->va_range)); 134 } 135 136 return true; 137 } 138 139 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) 140 { 141 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 142 143 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 144 145 // Local vidmem is always cached 146 if (uvm_id_equal(resident_id, gpu->id)) 147 return UVM_MMU_PTE_FLAGS_CACHED; 148 149 if (UVM_ID_IS_CPU(resident_id)) 150 return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; 151 152 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id)); 153 154 return uvm_exp_gpu_cache_peermem == 0 ? 
UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; 155 } 156 157 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) 158 { 159 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 160 161 return uvm_va_space_get_gpu(va_space, gpu_id); 162 } 163 164 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id) 165 { 166 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 167 168 return uvm_va_space_processor_name(va_space, id); 169 } 170 171 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id) 172 { 173 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 174 175 return uvm_va_space_processor_has_memory(va_space, id); 176 } 177 178 static bool is_uvm_fault_force_sysmem_set(void) 179 { 180 // Only enforce this during testing 181 return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0; 182 } 183 184 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space) 185 { 186 return uvm_perf_map_remote_on_eviction && 187 uvm_va_space_has_access_counter_migrations(va_space); 188 } 189 190 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block) 191 { 192 // Note that for HMM we always return a pointer to a zero bitmap 193 // (not allocated on the stack) since uvm_lite GPUs are not supported. 194 static const uvm_processor_mask_t uvm_lite_gpus = {}; 195 196 if (uvm_va_block_is_hmm(va_block)) 197 return &uvm_lite_gpus; 198 else 199 return &va_block->va_range->uvm_lite_gpus; 200 } 201 202 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry) 203 { 204 if (!retry) 205 return; 206 207 uvm_tracker_init(&retry->tracker); 208 INIT_LIST_HEAD(&retry->used_chunks); 209 INIT_LIST_HEAD(&retry->free_chunks); 210 } 211 212 // The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks 213 // are stored. 214 // 215 // CPU chunk storage is handled in three different ways depending on the 216 // type of chunks the VA block owns. This is done to minimize the memory 217 // required to hold metadata. 218 typedef enum 219 { 220 // The uvm_va_block_t::chunk pointer points to a single 2MB 221 // CPU chunk. 222 UVM_CPU_CHUNK_STORAGE_CHUNK = 0, 223 224 // The uvm_va_block_t::chunks pointer points to a 225 // structure of mixed (64K and 4K) chunks. 226 UVM_CPU_CHUNK_STORAGE_MIXED, 227 UVM_CPU_CHUNK_STORAGE_COUNT, 228 } uvm_cpu_chunk_storage_type_t; 229 230 #define UVM_CPU_CHUNK_STORAGE_MASK 0x1 231 232 // The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is 233 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will 234 // be stored in the first/last entry, respectively. 235 #define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK 236 237 #define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE) 238 239 // This structure is used when a VA block contains 64K or a mix of 64K and 4K 240 // CPU chunks. 241 // For every 64K CPU chunks, big_chunks will have its corresponding bit set 242 // and the corresponding index in slots will point directly to the 243 // uvm_cpu_chunk_t structure. 244 // 245 // For 4K CPU chunks, the corresponding bit in big_chunks will be clear and 246 // the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers. 
247 typedef struct { 248 DECLARE_BITMAP(big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK); 249 void *slots[MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK]; 250 } uvm_cpu_chunk_storage_mixed_t; 251 252 static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block, 253 uvm_cpu_chunk_t *chunk, 254 uvm_page_index_t page_index) 255 { 256 UVM_ASSERT(chunk); 257 return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index); 258 } 259 260 static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block) 261 { 262 return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK); 263 } 264 265 static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block) 266 { 267 return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK; 268 } 269 270 static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size) 271 { 272 return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE; 273 } 274 275 static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index) 276 { 277 uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block); 278 uvm_page_index_t prefix; 279 size_t slot_index; 280 281 UVM_ASSERT(page_index < block_region.outer); 282 prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K); 283 284 if (page_index < prefix) 285 return 0; 286 287 slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix; 288 UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK); 289 290 return slot_index; 291 } 292 293 static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index) 294 { 295 size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K); 296 297 if (page_index < prefix) 298 return page_index; 299 300 return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT; 301 } 302 303 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, 304 uvm_cpu_chunk_t *chunk, 305 uvm_page_index_t page_index) 306 { 307 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); 308 uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk)); 309 size_t slot_index; 310 uvm_cpu_chunk_storage_mixed_t *mixed; 311 uvm_cpu_chunk_t **chunks = NULL; 312 313 // We only want to use the bottom bit of a pointer. 314 BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2); 315 316 // We want to protect against two threads manipulating the VA block's CPU 317 // chunks at the same time. However, when a block is split, the new block's 318 // lock is locked without tracking. So, we can't use 319 // uvm_assert_mutex_locked(). 
320 UVM_ASSERT(mutex_is_locked(&va_block->lock.m)); 321 322 if (chunk_size == UVM_CHUNK_SIZE_2M) { 323 UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M); 324 UVM_ASSERT(!va_block->cpu.chunks); 325 va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK; 326 } 327 else { 328 if (!va_block->cpu.chunks) { 329 mixed = uvm_kvmalloc_zero(sizeof(*mixed)); 330 if (!mixed) 331 return NV_ERR_NO_MEMORY; 332 333 va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 334 } 335 336 UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED); 337 mixed = uvm_cpu_storage_get_ptr(va_block); 338 slot_index = compute_slot_index(va_block, page_index); 339 UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index); 340 UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks)); 341 342 if (chunk_size == UVM_CHUNK_SIZE_64K) { 343 mixed->slots[slot_index] = chunk; 344 set_bit(slot_index, mixed->big_chunks); 345 } 346 else { 347 size_t small_index; 348 349 UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K); 350 chunks = mixed->slots[slot_index]; 351 352 if (!chunks) { 353 chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 354 if (!chunks) 355 return NV_ERR_NO_MEMORY; 356 mixed->slots[slot_index] = chunks; 357 } 358 359 small_index = compute_small_index(va_block, page_index); 360 chunks[small_index] = chunk; 361 } 362 } 363 364 uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region); 365 return NV_OK; 366 } 367 368 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index) 369 { 370 uvm_cpu_chunk_storage_mixed_t *mixed; 371 uvm_cpu_chunk_t *chunk; 372 uvm_cpu_chunk_t **chunks; 373 size_t slot_index; 374 375 UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block)); 376 if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index)) 377 return NULL; 378 379 UVM_ASSERT(va_block->cpu.chunks); 380 381 if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) { 382 return uvm_cpu_storage_get_ptr(va_block); 383 } 384 else { 385 mixed = uvm_cpu_storage_get_ptr(va_block); 386 slot_index = compute_slot_index(va_block, page_index); 387 UVM_ASSERT(mixed->slots[slot_index] != NULL); 388 if (test_bit(slot_index, mixed->big_chunks)) 389 return mixed->slots[slot_index]; 390 391 chunks = mixed->slots[slot_index]; 392 chunk = chunks[compute_small_index(va_block, page_index)]; 393 } 394 395 UVM_ASSERT(chunk); 396 return chunk; 397 } 398 399 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, 400 uvm_page_index_t page_index) 401 { 402 uvm_cpu_chunk_storage_mixed_t *mixed; 403 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); 404 uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index); 405 size_t slot_index; 406 uvm_cpu_chunk_t **chunks; 407 408 // We want to protect against two threads manipulating the VA block's CPU 409 // chunks at the same time. However, when a block is split, the new block's 410 // lock is locked without tracking. So, we can't use 411 // uvm_assert_mutex_locked(). 
412 UVM_ASSERT(mutex_is_locked(&va_block->lock.m)); 413 UVM_ASSERT(va_block->cpu.chunks); 414 UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk)); 415 416 if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) { 417 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 418 UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk); 419 va_block->cpu.chunks = 0; 420 } 421 else { 422 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M); 423 mixed = uvm_cpu_storage_get_ptr(va_block); 424 slot_index = compute_slot_index(va_block, page_index); 425 UVM_ASSERT(mixed->slots[slot_index] != NULL); 426 427 if (test_bit(slot_index, mixed->big_chunks)) { 428 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 429 UVM_ASSERT(mixed->slots[slot_index] == chunk); 430 mixed->slots[slot_index] = NULL; 431 clear_bit(slot_index, mixed->big_chunks); 432 } 433 else { 434 size_t small_index; 435 436 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K); 437 chunks = mixed->slots[slot_index]; 438 small_index = compute_small_index(va_block, page_index); 439 UVM_ASSERT(chunks[small_index] == chunk); 440 chunks[small_index] = NULL; 441 442 for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) { 443 if (chunks[small_index]) 444 break; 445 } 446 447 if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) { 448 uvm_kvfree(chunks); 449 mixed->slots[slot_index] = NULL; 450 } 451 } 452 } 453 454 uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region); 455 456 if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) { 457 uvm_kvfree(uvm_cpu_storage_get_ptr(va_block)); 458 va_block->cpu.chunks = 0; 459 } 460 } 461 462 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index) 463 { 464 uvm_va_block_region_t chunk_region; 465 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); 466 467 UVM_ASSERT(chunk); 468 UVM_ASSERT(chunk->page); 469 chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index); 470 return chunk->page + (page_index - chunk_region.first); 471 } 472 473 static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block, 474 uvm_va_block_region_t region, 475 uvm_page_index_t *first_chunk_page) 476 { 477 uvm_cpu_chunk_t *chunk = NULL; 478 uvm_page_index_t page_index; 479 480 page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated); 481 if (page_index < region.outer) 482 chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); 483 484 if (first_chunk_page && chunk) { 485 uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index); 486 *first_chunk_page = chunk_region.first; 487 } 488 489 return chunk; 490 } 491 492 #define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region) \ 493 for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)); \ 494 (chunk) != NULL; \ 495 (chunk) = uvm_cpu_chunk_first_in_region((va_block), \ 496 uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \ 497 (region).outer), \ 498 &(page_index))) 499 500 #define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) \ 501 for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)), \ 502 (next_page_index) = (page_index) + (chunk ? 
uvm_cpu_chunk_num_pages(chunk) : 0); \ 503 (chunk) != NULL; \ 504 (chunk) = uvm_cpu_chunk_first_in_region((va_block), \ 505 uvm_va_block_region((next_page_index), (region).outer), \ 506 &(page_index)), \ 507 (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0)) 508 509 #define for_each_cpu_chunk_in_block(chunk, page_index, va_block) \ 510 for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block))) 511 512 #define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block) \ 513 for_each_cpu_chunk_in_block_region_safe((chunk), \ 514 (page_index), \ 515 (next_page_index), \ 516 (va_block), \ 517 uvm_va_block_region_from_block((va_block))) 518 519 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block, 520 struct mm_struct *mm, 521 NvU64 start, 522 uvm_va_block_region_t *region) 523 { 524 struct vm_area_struct *vma; 525 NvU64 end; 526 527 if (start > va_block->end) 528 return NULL; 529 530 vma = find_vma_intersection(mm, start, va_block->end + 1); 531 if (!vma) 532 return NULL; 533 534 if (start < vma->vm_start) 535 start = vma->vm_start; 536 537 end = vma->vm_end - 1; 538 if (end > va_block->end) 539 end = va_block->end; 540 541 *region = uvm_va_block_region_from_start_end(va_block, start, end); 542 543 return vma; 544 } 545 546 static bool block_check_cpu_chunks(uvm_va_block_t *block) 547 { 548 uvm_cpu_chunk_t *chunk; 549 size_t alloced_pages = 0; 550 uvm_va_block_region_t prev_region = { 0 }; 551 uvm_page_index_t page_index; 552 553 for_each_cpu_chunk_in_block(chunk, page_index, block) { 554 uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index); 555 size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk); 556 uvm_page_index_t chunk_page; 557 558 UVM_ASSERT(prev_region.outer <= chunk_region.first); 559 UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk))); 560 UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block)); 561 562 alloced_pages += uvm_cpu_chunk_num_pages(chunk); 563 UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region)); 564 prev_region = chunk_region; 565 566 for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++) 567 UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk); 568 } 569 570 UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated)); 571 572 return true; 573 } 574 575 // Frees any left-over free chunks and unpins all the used chunks 576 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block) 577 { 578 uvm_gpu_t *gpu; 579 uvm_gpu_chunk_t *gpu_chunk; 580 uvm_gpu_chunk_t *next_chunk; 581 582 if (!retry) 583 return; 584 585 uvm_tracker_deinit(&retry->tracker); 586 587 // Free any unused chunks 588 list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) { 589 list_del_init(&gpu_chunk->list); 590 gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); 591 uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL); 592 } 593 594 // Unpin all the used chunks now that we are done 595 list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) { 596 list_del_init(&gpu_chunk->list); 597 gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); 598 // HMM should have already moved allocated blocks to the referenced 599 // state so any left over were not migrated and should be freed. 
600 if (uvm_va_block_is_hmm(va_block)) 601 uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL); 602 else 603 uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block); 604 } 605 } 606 607 static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk) 608 { 609 list_add_tail(&gpu_chunk->list, &retry->free_chunks); 610 } 611 612 static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk) 613 { 614 list_add_tail(&gpu_chunk->list, &retry->used_chunks); 615 } 616 617 static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size) 618 { 619 uvm_gpu_chunk_t *gpu_chunk; 620 621 list_for_each_entry(gpu_chunk, &retry->free_chunks, list) { 622 if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) { 623 list_del_init(&gpu_chunk->list); 624 return gpu_chunk; 625 } 626 } 627 628 return NULL; 629 } 630 631 // Encapsulates a reference to a physical page belonging to a specific processor 632 // within a VA block. 633 typedef struct 634 { 635 // Processor the page is on 636 uvm_processor_id_t processor; 637 638 // The page index 639 uvm_page_index_t page_index; 640 } block_phys_page_t; 641 642 static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index) 643 { 644 return (block_phys_page_t){ processor, page_index }; 645 } 646 647 NV_STATUS uvm_va_block_init(void) 648 { 649 if (uvm_enable_builtin_tests) 650 g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t); 651 else 652 g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t); 653 654 if (!g_uvm_va_block_cache) 655 return NV_ERR_NO_MEMORY; 656 657 g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t); 658 if (!g_uvm_va_block_gpu_state_cache) 659 return NV_ERR_NO_MEMORY; 660 661 g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t); 662 if (!g_uvm_page_mask_cache) 663 return NV_ERR_NO_MEMORY; 664 665 g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t); 666 if (!g_uvm_va_block_context_cache) 667 return NV_ERR_NO_MEMORY; 668 669 return NV_OK; 670 } 671 672 void uvm_va_block_exit(void) 673 { 674 kmem_cache_destroy_safe(&g_uvm_va_block_context_cache); 675 kmem_cache_destroy_safe(&g_uvm_page_mask_cache); 676 kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache); 677 kmem_cache_destroy_safe(&g_uvm_va_block_cache); 678 } 679 680 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm) 681 { 682 uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS); 683 if (block_context) 684 uvm_va_block_context_init(block_context, mm); 685 686 return block_context; 687 } 688 689 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context) 690 { 691 if (va_block_context) 692 kmem_cache_free(g_uvm_va_block_context_cache, va_block_context); 693 } 694 695 // Convert from page_index to chunk_index. The goal is for each system page in 696 // the region [start, start + size) to be covered by the largest naturally- 697 // aligned user chunk size. 
698 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start, 699 NvU64 size, 700 uvm_gpu_t *gpu, 701 uvm_page_index_t page_index, 702 uvm_chunk_size_t *out_chunk_size) 703 { 704 uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes; 705 uvm_chunk_size_t chunk_size, final_chunk_size; 706 size_t num_chunks, num_chunks_total; 707 NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size; 708 709 UVM_ASSERT(PAGE_ALIGNED(start)); 710 UVM_ASSERT(PAGE_ALIGNED(size)); 711 UVM_ASSERT(size > 0); 712 UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M); 713 UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M)); 714 BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M); 715 716 // PAGE_SIZE needs to be the lowest natively-supported chunk size in the 717 // mask, since we never deal with chunk sizes smaller than that (although we 718 // may have PTEs mapping pages smaller than that). 719 UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE); 720 721 // Optimize the ideal Pascal+ case: the whole block is covered by a single 722 // 2M page. 723 if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) { 724 UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M)); 725 final_chunk_size = UVM_CHUNK_SIZE_2M; 726 num_chunks_total = 0; 727 goto out; 728 } 729 730 // Only one 2M chunk can fit within a VA block on any GPU architecture, so 731 // remove that size from consideration. 732 chunk_sizes &= ~UVM_CHUNK_SIZE_2M; 733 734 // Next common case: the whole block is aligned and sized to perfectly fit 735 // the largest page size. 736 final_chunk_size = uvm_chunk_find_last_size(chunk_sizes); 737 if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) { 738 num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size); 739 goto out; 740 } 741 742 // We didn't hit our special paths. Do it the hard way. 743 744 num_chunks_total = 0; 745 addr = start + page_index * PAGE_SIZE; 746 end = start + size; 747 final_chunk_size = 0; 748 UVM_ASSERT(addr < end); 749 750 // The below loop collapses almost completely when chunk_size == PAGE_SIZE 751 // since in that lowest-common-denominator case everything is already 752 // aligned. Skip it and handle that specially after the loop. 753 // 754 // Note that since we removed 2M already above, this loop will only iterate 755 // once on x86 Pascal+ since only 64K is left. 756 chunk_sizes &= ~PAGE_SIZE; 757 758 // This loop calculates the number of chunks between start and addr by 759 // calculating the number of whole chunks of each size between them, 760 // starting with the largest allowed chunk size. This requires fewer 761 // iterations than if we began from start and kept calculating the next 762 // larger chunk size boundary. 763 for_each_chunk_size_rev(chunk_size, chunk_sizes) { 764 aligned_start = UVM_ALIGN_UP(start, chunk_size); 765 aligned_addr = UVM_ALIGN_DOWN(addr, chunk_size); 766 aligned_end = UVM_ALIGN_DOWN(end, chunk_size); 767 768 // If addr and start are within the same chunk, try smaller 769 if (aligned_start > aligned_addr) 770 continue; 771 772 // If addr and end are not in the same chunk, then addr is covered by a 773 // single chunk of the current size. Ignore smaller boundaries between 774 // addr and aligned_addr. 775 if (aligned_addr < aligned_end && final_chunk_size == 0) { 776 addr = aligned_addr; 777 final_chunk_size = chunk_size; 778 } 779 780 // How many chunks of this size are between start and addr? 
Note that 781 // this might be 0 since aligned_addr and aligned_start could be in the 782 // same chunk. 783 num_chunks = uvm_div_pow2_32(((NvU32)aligned_addr - aligned_start), chunk_size); 784 num_chunks_total += num_chunks; 785 786 // We've already accounted for these chunks, so "remove" them by 787 // bringing start, addr, and end closer together to calculate the 788 // remaining chunk sizes. 789 temp_size = num_chunks * chunk_size; 790 addr -= temp_size; 791 end -= temp_size; 792 793 // Once there's no separation between addr and start, and we've 794 // successfully found the right chunk size when taking end into account, 795 // we're done. 796 if (addr == start && final_chunk_size) 797 break; 798 } 799 800 // Handle PAGE_SIZE cleanup since we skipped it in the loop 801 num_chunks_total += (addr - start) / PAGE_SIZE; 802 if (final_chunk_size == 0) 803 final_chunk_size = PAGE_SIZE; 804 805 out: 806 if (out_chunk_size) 807 *out_chunk_size = final_chunk_size; 808 809 return num_chunks_total; 810 } 811 812 static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block, 813 NvU64 start, 814 NvU64 size, 815 uvm_gpu_t *gpu, 816 uvm_page_index_t page_index, 817 uvm_chunk_size_t *out_chunk_size) 818 { 819 if (uvm_va_block_is_hmm(va_block)) { 820 if (out_chunk_size) 821 *out_chunk_size = PAGE_SIZE; 822 return page_index; 823 } 824 825 return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size); 826 } 827 828 static size_t block_gpu_chunk_index(uvm_va_block_t *block, 829 uvm_gpu_t *gpu, 830 uvm_page_index_t page_index, 831 uvm_chunk_size_t *out_chunk_size) 832 { 833 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 834 uvm_chunk_size_t size; 835 uvm_gpu_chunk_t *chunk; 836 size_t index; 837 838 index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size); 839 840 UVM_ASSERT(size >= PAGE_SIZE); 841 842 if (gpu_state) { 843 UVM_ASSERT(gpu_state->chunks); 844 chunk = gpu_state->chunks[index]; 845 if (chunk) { 846 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size); 847 UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED); 848 UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE); 849 } 850 } 851 852 if (out_chunk_size) 853 *out_chunk_size = size; 854 855 return index; 856 } 857 858 // Compute the size of the chunk known to start at start_page_index 859 static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index) 860 { 861 uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes; 862 uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes; 863 NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index); 864 NvU64 size = block->end - start + 1; 865 866 if (uvm_va_block_is_hmm(block)) 867 return PAGE_SIZE; 868 869 // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a 870 // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x. 871 // Example: 1011000 -> 0001111 872 start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1)); 873 874 // Next, compute all sizes (powers of two) which are <= size. 875 pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size); 876 pow2_leq_size |= pow2_leq_size - 1; 877 878 // Now and them all together to get our list of GPU-supported chunk sizes 879 // which are aligned to start and will fit within size. 
880 allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size; 881 882 // start and size must always be aligned to at least the smallest supported 883 // chunk size (PAGE_SIZE). 884 UVM_ASSERT(allowed_sizes >= PAGE_SIZE); 885 886 // Take the largest allowed size 887 return uvm_chunk_find_last_size(allowed_sizes); 888 } 889 890 static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu) 891 { 892 return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1; 893 } 894 895 static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu) 896 { 897 uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1); 898 return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1; 899 } 900 901 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address) 902 { 903 size_t chunk_index; 904 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 905 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address); 906 907 uvm_assert_mutex_locked(&va_block->lock); 908 909 if (!gpu_state) 910 return NULL; 911 912 chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL); 913 914 return gpu_state->chunks[chunk_index]; 915 } 916 917 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range, 918 NvU64 start, 919 NvU64 end, 920 uvm_va_block_t **out_block) 921 { 922 uvm_va_block_t *block = NULL; 923 NvU64 size = end - start + 1; 924 925 UVM_ASSERT(PAGE_ALIGNED(start)); 926 UVM_ASSERT(PAGE_ALIGNED(end + 1)); 927 UVM_ASSERT(PAGE_ALIGNED(size)); 928 UVM_ASSERT(size > 0); 929 UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE); 930 931 if (va_range) { 932 // Create a managed va_block. 
933 UVM_ASSERT(start >= va_range->node.start); 934 UVM_ASSERT(end <= va_range->node.end); 935 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 936 } 937 938 // Blocks can't span a block alignment boundary 939 UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end)); 940 941 if (uvm_enable_builtin_tests) { 942 uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS); 943 944 if (block_wrapper) 945 block = &block_wrapper->block; 946 } 947 else { 948 block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS); 949 } 950 951 if (!block) 952 return NV_ERR_NO_MEMORY; 953 954 nv_kref_init(&block->kref); 955 uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK); 956 block->start = start; 957 block->end = end; 958 block->va_range = va_range; 959 uvm_tracker_init(&block->tracker); 960 block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID; 961 962 nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block); 963 964 *out_block = block; 965 return NV_OK; 966 } 967 968 static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu) 969 { 970 NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 971 if (gpu_mapping_addr == 0) 972 return; 973 974 uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr); 975 uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent); 976 } 977 978 static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, 979 uvm_va_block_t *block, 980 uvm_page_index_t page_index, 981 uvm_gpu_t *gpu) 982 { 983 NV_STATUS status; 984 uvm_chunk_size_t chunk_size; 985 986 // When the Confidential Computing feature is enabled the transfers don't 987 // use the DMA mapping of CPU chunks (since it's protected memory), but 988 // the DMA address of the unprotected dma buffer. 989 if (uvm_conf_computing_mode_enabled(gpu)) 990 return NV_OK; 991 992 status = uvm_cpu_chunk_map_gpu(chunk, gpu); 993 if (status != NV_OK) 994 return status; 995 996 chunk_size = uvm_cpu_chunk_get_size(chunk); 997 998 // TODO: Bug 3744779: Handle benign assertion in 999 // pmm_sysmem_mappings_remove_gpu_mapping() in case of a 1000 // failure. 
1001 status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, 1002 uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent), 1003 uvm_va_block_cpu_page_address(block, page_index), 1004 chunk_size, 1005 block, 1006 UVM_ID_CPU); 1007 if (status != NV_OK) 1008 cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu); 1009 1010 return status; 1011 } 1012 1013 static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu) 1014 { 1015 uvm_cpu_chunk_t *chunk; 1016 uvm_page_index_t page_index; 1017 1018 for_each_cpu_chunk_in_block(chunk, page_index, block) 1019 cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu); 1020 } 1021 1022 static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu) 1023 { 1024 NV_STATUS status; 1025 uvm_cpu_chunk_t *chunk; 1026 NvU64 block_mapping_size = uvm_va_block_size(block); 1027 uvm_page_index_t page_index; 1028 1029 UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K)); 1030 1031 for_each_cpu_chunk_in_block(chunk, page_index, block) { 1032 UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0, 1033 "GPU%u DMA address 0x%llx\n", 1034 uvm_id_value(gpu->id), 1035 uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent)); 1036 1037 status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu); 1038 if (status != NV_OK) 1039 goto error; 1040 } 1041 1042 return NV_OK; 1043 1044 error: 1045 block_gpu_unmap_phys_all_cpu_pages(block, gpu); 1046 return status; 1047 } 1048 1049 static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block, 1050 uvm_gpu_t *local_gpu, 1051 uvm_gpu_chunk_t *chunk, 1052 uvm_gpu_t *accessing_gpu) 1053 { 1054 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu); 1055 return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, 1056 peer_addr, 1057 block->start + chunk->va_block_page_index * PAGE_SIZE, 1058 uvm_gpu_chunk_get_size(chunk), 1059 block, 1060 local_gpu->id); 1061 } 1062 1063 static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu, 1064 uvm_gpu_chunk_t *chunk, 1065 uvm_gpu_t *accessing_gpu) 1066 { 1067 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu); 1068 uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr); 1069 } 1070 1071 static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block, 1072 uvm_gpu_t *local_gpu, 1073 uvm_gpu_t *accessing_gpu) 1074 { 1075 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id); 1076 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 1077 size_t num_chunks, i; 1078 NV_STATUS status; 1079 1080 UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)], 1081 accessing_gpu->id)); 1082 1083 // If no chunks are allocated currently, the mappings will be created later 1084 // at chunk allocation. 
1085 if (!gpu_state || !gpu_state->chunks) 1086 return NV_OK; 1087 1088 num_chunks = block_num_gpu_chunks(block, local_gpu); 1089 for (i = 0; i < num_chunks; i++) { 1090 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 1091 if (!chunk) 1092 continue; 1093 1094 status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu); 1095 if (status != NV_OK) 1096 goto error; 1097 1098 status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu); 1099 if (status != NV_OK) 1100 goto error; 1101 } 1102 1103 return NV_OK; 1104 1105 error: 1106 while (i-- > 0) { 1107 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 1108 if (chunk) { 1109 // Indirect peer mappings are removed lazily by PMM, so if an error 1110 // occurs the mappings established above will be removed when the 1111 // chunk is freed later on. We only need to remove the sysmem 1112 // reverse mappings. 1113 block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu); 1114 } 1115 } 1116 1117 return status; 1118 } 1119 1120 // Mappings for indirect peers are removed lazily by PMM, but we need to remove 1121 // the entries from the reverse map. 1122 static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block, 1123 uvm_gpu_t *local_gpu, 1124 uvm_gpu_t *accessing_gpu) 1125 { 1126 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id); 1127 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 1128 size_t num_chunks, i; 1129 1130 UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)], 1131 accessing_gpu->id)); 1132 1133 // Exit if no chunks are allocated currently. 1134 if (!gpu_state || !gpu_state->chunks) 1135 return; 1136 1137 num_chunks = block_num_gpu_chunks(block, local_gpu); 1138 for (i = 0; i < num_chunks; i++) { 1139 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 1140 if (chunk) 1141 block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu); 1142 } 1143 } 1144 1145 // Retrieves the gpu_state for the given GPU. The returned pointer is 1146 // internally managed and will be allocated (and freed) automatically, 1147 // rather than by the caller. 
1148 static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu) 1149 { 1150 NV_STATUS status; 1151 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 1152 1153 if (gpu_state) 1154 return gpu_state; 1155 1156 gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS); 1157 if (!gpu_state) 1158 return NULL; 1159 1160 gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0])); 1161 if (!gpu_state->chunks) 1162 goto error; 1163 1164 block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state; 1165 1166 status = block_gpu_map_phys_all_cpu_pages(block, gpu); 1167 if (status != NV_OK) 1168 goto error; 1169 1170 return gpu_state; 1171 1172 error: 1173 uvm_kvfree(gpu_state->chunks); 1174 kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state); 1175 block->gpus[uvm_id_gpu_index(gpu->id)] = NULL; 1176 1177 return NULL; 1178 } 1179 1180 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block) 1181 { 1182 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 1183 uvm_gpu_id_t gpu_id; 1184 1185 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1186 uvm_assert_mutex_locked(&va_block->lock); 1187 1188 for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) { 1189 if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id))) 1190 return NV_ERR_NO_MEMORY; 1191 } 1192 1193 return NV_OK; 1194 } 1195 1196 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block, 1197 uvm_cpu_chunk_t *chunk, 1198 uvm_page_index_t page_index) 1199 { 1200 uvm_gpu_id_t id; 1201 1202 for_each_gpu_id(id) { 1203 if (uvm_va_block_gpu_state_get(block, id)) 1204 cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id)); 1205 } 1206 } 1207 1208 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block, 1209 uvm_page_index_t page_index) 1210 { 1211 NV_STATUS status; 1212 uvm_gpu_id_t id; 1213 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 1214 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); 1215 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index); 1216 1217 // We can't iterate over va_space->registered_gpus because we might be 1218 // on the eviction path, which does not have the VA space lock held. We have 1219 // the VA block lock held however, so the gpu_states can't change. 
1220 uvm_assert_mutex_locked(&block->lock); 1221 1222 for_each_gpu_id(id) { 1223 uvm_gpu_t *gpu; 1224 1225 if (!uvm_va_block_gpu_state_get(block, id)) 1226 continue; 1227 1228 gpu = block_get_gpu(block, id); 1229 status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu); 1230 if (status != NV_OK) 1231 goto error; 1232 } 1233 1234 return NV_OK; 1235 1236 error: 1237 uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index); 1238 return status; 1239 } 1240 1241 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region) 1242 { 1243 uvm_cpu_chunk_t *chunk; 1244 uvm_page_index_t page_index, next_page_index; 1245 uvm_va_block_region_t chunk_region; 1246 1247 for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) { 1248 chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk)); 1249 1250 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region); 1251 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region); 1252 uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region); 1253 uvm_cpu_chunk_remove_from_block(va_block, page_index); 1254 uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index); 1255 uvm_cpu_chunk_free(chunk); 1256 } 1257 1258 if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) 1259 uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU); 1260 if (uvm_page_mask_empty(&va_block->cpu.resident)) 1261 uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU); 1262 } 1263 1264 // Create physical mappings to allow other GPUs to access this chunk. 1265 static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 1266 { 1267 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 1268 uvm_gpu_t *accessing_gpu, *remove_gpu; 1269 NV_STATUS status; 1270 1271 // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on 1272 // the eviction path, so we can assume that the VA space is locked. 1273 // 1274 // TODO: Bug 2007346: In the future we may want to enable eviction to peers, 1275 // meaning we may need to allocate peer memory and map it on the 1276 // eviction path. That will require making sure that peers can't be 1277 // enabled or disabled either in the VA space or globally within this 1278 // function. 1279 uvm_assert_rwsem_locked(&va_space->lock); 1280 uvm_assert_mutex_locked(&block->lock); 1281 1282 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 1283 status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu); 1284 if (status != NV_OK) 1285 goto error; 1286 1287 status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu); 1288 if (status != NV_OK) 1289 goto error; 1290 } 1291 1292 return NV_OK; 1293 1294 error: 1295 for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 1296 if (remove_gpu == accessing_gpu) 1297 break; 1298 1299 // Indirect peer mappings are removed lazily by PMM, so if an error 1300 // occurs the mappings established above will be removed when the 1301 // chunk is freed later on. We only need to remove the sysmem 1302 // reverse mappings. 
1303 block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu); 1304 } 1305 1306 return status; 1307 } 1308 1309 static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 1310 { 1311 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 1312 uvm_gpu_t *peer_gpu; 1313 1314 uvm_assert_rwsem_locked(&va_space->lock); 1315 uvm_assert_mutex_locked(&block->lock); 1316 1317 // Indirect peer mappings are removed lazily by PMM, so we only need to 1318 // remove the sysmem reverse mappings. 1319 for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) 1320 block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu); 1321 } 1322 1323 // Mark a CPU page as dirty. 1324 static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index) 1325 { 1326 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 1327 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index); 1328 uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first); 1329 } 1330 1331 // Mark a CPU page as clean. 1332 static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index) 1333 { 1334 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 1335 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index); 1336 uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first); 1337 } 1338 1339 // Check if a CPU page is dirty. 1340 static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index) 1341 { 1342 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 1343 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index); 1344 return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first); 1345 } 1346 1347 static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block, 1348 uvm_chunk_size_t alloc_size, 1349 uvm_cpu_chunk_alloc_flags_t flags, 1350 uvm_cpu_chunk_t **chunk) 1351 { 1352 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); 1353 1354 // Return out of memory error if the tests have requested it. As opposed to 1355 // other error injection settings, this one fails N times and then succeeds. 1356 // TODO: Bug 3701182: This will print a warning in Linux kernels newer than 1357 // 5.16.0-rc1+. 1358 if (block_test && block_test->inject_cpu_pages_allocation_error_count) { 1359 if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0) 1360 block_test->inject_cpu_pages_allocation_error_count--; 1361 return NV_ERR_NO_MEMORY; 1362 } 1363 1364 return uvm_cpu_chunk_alloc(alloc_size, flags, chunk); 1365 } 1366 1367 // Allocates the input page in the block, if it doesn't already exist 1368 // 1369 // Also maps the page for physical access by all GPUs used by the block, which 1370 // is required for IOMMU support. Skipped on GPUs without access to CPU memory. 1371 // e.g., this happens when the Confidential Computing Feature is enabled. 
1372 static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block, 1373 uvm_page_mask_t *populate_page_mask, 1374 uvm_va_block_region_t populate_region, 1375 uvm_va_block_context_t *block_context) 1376 { 1377 NV_STATUS status = NV_OK; 1378 uvm_cpu_chunk_t *chunk; 1379 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); 1380 uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes(); 1381 uvm_chunk_size_t alloc_size; 1382 uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask; 1383 uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE; 1384 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 1385 uvm_processor_mask_t uvm_lite_gpus; 1386 uvm_page_index_t page_index; 1387 uvm_gpu_id_t id; 1388 1389 // Check whether all requested pages have already been allocated. 1390 uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask); 1391 if (!uvm_page_mask_andnot(&block_context->scratch_page_mask, 1392 &block_context->scratch_page_mask, 1393 &block->cpu.allocated)) 1394 return NV_OK; 1395 1396 if (block_test) { 1397 if (block_test->cpu_chunk_allocation_size_mask) 1398 cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask; 1399 } 1400 1401 uvm_page_mask_zero(resident_mask); 1402 for_each_id_in_mask (id, &block->resident) 1403 uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id)); 1404 1405 // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations 1406 // should be used in order to avoid extra copies due to dirty compound 1407 // pages. HMM va_blocks also require PAGE_SIZE allocations. 1408 // TODO: Bug 3368756: add support for HMM transparent huge page (THP) 1409 // migrations. 1410 uvm_processor_mask_andnot(&uvm_lite_gpus, &va_space->registered_gpus, &va_space->faultable_processors); 1411 if (!uvm_processor_mask_empty(&uvm_lite_gpus) || uvm_va_block_is_hmm(block)) 1412 cpu_allocation_sizes = PAGE_SIZE; 1413 1414 if (block_context->mm) 1415 alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT; 1416 1417 UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE); 1418 UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE); 1419 1420 for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) { 1421 uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags; 1422 uvm_va_block_region_t region = populate_region; 1423 1424 if (uvm_page_mask_test(&block->cpu.allocated, page_index)) { 1425 page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1; 1426 continue; 1427 } 1428 1429 UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index)); 1430 1431 chunk_alloc_flags = alloc_flags; 1432 1433 // Attempt to allocate CPU pages with the largest physically contiguous 1434 // size from the set of CPU chunk sizes that we can. 1435 // This is accomplished by: 1436 // 1. Aligning the CPU page address down to the allocation size. 1437 // 2. Ensuring that the entire allocation region fits withing the VA 1438 // block. 1439 // 3. Ensuring that the region covered by the allocation is empty. 
1440 for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) { 1441 NvU64 alloc_virt_addr; 1442 1443 chunk = NULL; 1444 alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(block, page_index), alloc_size); 1445 1446 if (!uvm_va_block_contains_address(block, alloc_virt_addr) || 1447 !uvm_va_block_contains_address(block, alloc_virt_addr + alloc_size - 1)) 1448 continue; 1449 1450 region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1); 1451 1452 if (!uvm_page_mask_region_empty(&block->cpu.allocated, region)) 1453 continue; 1454 1455 // If not all pages in the allocation region are resident somewhere, 1456 // zero out the allocated page. 1457 // This could be wasteful if only a few pages in high-order 1458 // allocation need to be zero'ed out but the alternative is to map 1459 // single sub-pages one-by-one. 1460 if (!uvm_page_mask_region_full(resident_mask, region)) 1461 chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO; 1462 1463 status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk); 1464 if (status == NV_OK) { 1465 page_index = region.first; 1466 break; 1467 } 1468 1469 UVM_ASSERT(status == NV_ERR_NO_MEMORY); 1470 } 1471 1472 if (status != NV_OK) 1473 break; 1474 1475 status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index); 1476 if (status != NV_OK) { 1477 uvm_cpu_chunk_free(chunk); 1478 return status; 1479 } 1480 1481 status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index); 1482 if (status != NV_OK) 1483 break; 1484 1485 // Skip iterating over all pages covered by the allocated chunk. 1486 page_index = region.outer - 1; 1487 } 1488 1489 if (status != NV_OK && chunk) { 1490 uvm_cpu_chunk_remove_from_block(block, page_index); 1491 uvm_cpu_chunk_free(chunk); 1492 } 1493 1494 return status; 1495 } 1496 1497 // Try allocating a chunk. If eviction was required, 1498 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was 1499 // unlocked and relocked. The caller is responsible for adding the chunk to the 1500 // retry used_chunks list. 1501 static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block, 1502 uvm_va_block_retry_t *retry, 1503 uvm_gpu_t *gpu, 1504 uvm_chunk_size_t size, 1505 uvm_gpu_chunk_t **out_gpu_chunk) 1506 { 1507 NV_STATUS status = NV_OK; 1508 uvm_gpu_chunk_t *gpu_chunk; 1509 1510 // First try getting a free chunk from previously-made allocations. 1511 gpu_chunk = block_retry_get_free_chunk(retry, gpu, size); 1512 if (!gpu_chunk) { 1513 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); 1514 if (block_test && block_test->user_pages_allocation_retry_force_count > 0) { 1515 // Force eviction by pretending the allocation failed with no memory 1516 --block_test->user_pages_allocation_retry_force_count; 1517 status = NV_ERR_NO_MEMORY; 1518 } 1519 else { 1520 // Try allocating a new one without eviction 1521 status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker); 1522 } 1523 1524 if (status == NV_ERR_NO_MEMORY) { 1525 // If that fails with no memory, try allocating with eviction and 1526 // return back to the caller immediately so that the operation can 1527 // be restarted. 
1528 uvm_mutex_unlock(&block->lock); 1529 1530 status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker); 1531 if (status == NV_OK) { 1532 block_retry_add_free_chunk(retry, gpu_chunk); 1533 status = NV_ERR_MORE_PROCESSING_REQUIRED; 1534 } 1535 1536 uvm_mutex_lock(&block->lock); 1537 return status; 1538 } 1539 else if (status != NV_OK) { 1540 return status; 1541 } 1542 } 1543 1544 *out_gpu_chunk = gpu_chunk; 1545 return NV_OK; 1546 } 1547 1548 static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu) 1549 { 1550 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 1551 1552 if (!gpu_state) 1553 return false; 1554 1555 return gpu_state->page_table_range_4k.table || 1556 gpu_state->page_table_range_big.table || 1557 gpu_state->page_table_range_2m.table; 1558 } 1559 1560 // A helper to get a known-to-be-present GPU VA space given a VA block that's 1561 // locked. In order to use this function, the caller must know that at least one 1562 // of these conditions is true: 1563 // 1564 // 1) The VA space lock is held 1565 // 2) The VA block has active page tables for the GPU 1566 // 1567 // If the VA space lock is held (#1), then the gpu_va_space obviously can't go 1568 // away. 1569 // 1570 // On the eviction path, we don't have a lock on the VA space state. However, 1571 // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page 1572 // tables before destroying the gpu_va_space, we're guaranteed that if this GPU 1573 // has page tables (#2), the gpu_va_space can't go away while we're holding the 1574 // block lock. 1575 static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu) 1576 { 1577 uvm_gpu_va_space_t *gpu_va_space; 1578 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 1579 1580 UVM_ASSERT(gpu); 1581 1582 if (!block_gpu_has_page_tables(va_block, gpu)) 1583 uvm_assert_rwsem_locked(&va_space->lock); 1584 1585 UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id)); 1586 1587 gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)]; 1588 1589 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 1590 UVM_ASSERT(gpu_va_space->va_space == va_space); 1591 UVM_ASSERT(gpu_va_space->gpu == gpu); 1592 1593 return gpu_va_space; 1594 } 1595 1596 static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu) 1597 { 1598 uvm_gpu_va_space_t *gpu_va_space; 1599 1600 // TODO: Bug 3368756: add HMM support for transparent huge page migrations. 
1601 if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block)) 1602 return false; 1603 1604 UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M); 1605 1606 gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 1607 return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M); 1608 } 1609 1610 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu) 1611 { 1612 uvm_gpu_va_space_t *gpu_va_space; 1613 1614 gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 1615 return gpu_va_space->page_tables.big_page_size; 1616 } 1617 1618 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size) 1619 { 1620 NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size); 1621 NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size); 1622 1623 // The range must fit within a VA block 1624 UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end)); 1625 1626 if (outer_addr <= first_addr) 1627 return uvm_va_block_region(0, 0); 1628 1629 return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE); 1630 } 1631 1632 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size) 1633 { 1634 uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size); 1635 return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size); 1636 } 1637 1638 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size) 1639 { 1640 return range_big_page_region_all(va_block->start, va_block->end, big_page_size); 1641 } 1642 1643 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block, 1644 uvm_va_block_region_t region, 1645 NvU32 big_page_size) 1646 { 1647 NvU64 start = uvm_va_block_region_start(va_block, region); 1648 NvU64 end = uvm_va_block_region_end(va_block, region); 1649 uvm_va_block_region_t big_region; 1650 1651 UVM_ASSERT(start < va_block->end); 1652 UVM_ASSERT(end <= va_block->end); 1653 1654 big_region = range_big_page_region_all(start, end, big_page_size); 1655 if (big_region.outer) { 1656 big_region.first += region.first; 1657 big_region.outer += region.first; 1658 } 1659 1660 return big_region; 1661 } 1662 1663 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size) 1664 { 1665 return range_num_big_pages(va_block->start, va_block->end, big_page_size); 1666 } 1667 1668 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size) 1669 { 1670 NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size); 1671 UVM_ASSERT(addr >= va_block->start); 1672 UVM_ASSERT(addr < va_block->end); 1673 return addr; 1674 } 1675 1676 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size) 1677 { 1678 NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size); 1679 1680 // Assume that we don't have to handle multiple big PTEs per system page. 1681 // It's not terribly difficult to implement, but we don't currently have a 1682 // use case. 1683 UVM_ASSERT(big_page_size >= PAGE_SIZE); 1684 1685 return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size); 1686 } 1687 1688 // Returns the big page index (the bit index within 1689 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. 
If 1690 // page_index cannot be covered by a big PTE due to alignment or block size, 1691 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned. 1692 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size) 1693 { 1694 uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size); 1695 size_t big_index; 1696 1697 // Note that this condition also handles the case of having no big pages in 1698 // the block, in which case .first >= .outer. 1699 if (page_index < big_region_all.first || page_index >= big_region_all.outer) 1700 return MAX_BIG_PAGES_PER_UVM_VA_BLOCK; 1701 1702 big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size); 1703 1704 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start); 1705 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1); 1706 1707 return big_index; 1708 } 1709 1710 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block, 1711 uvm_gpu_t *gpu, 1712 uvm_page_mask_t *mask_out, 1713 const unsigned long *big_ptes_in) 1714 { 1715 uvm_va_block_region_t big_region; 1716 size_t big_page_index; 1717 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 1718 1719 uvm_page_mask_zero(mask_out); 1720 1721 for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 1722 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 1723 uvm_page_mask_region_fill(mask_out, big_region); 1724 } 1725 } 1726 1727 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index) 1728 { 1729 if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) 1730 return 0; 1731 1732 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU)); 1733 1734 // Despite the fact that physical CPU memory can be allocated at sizes 1735 // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory 1736 // with 4K PTEs. Until the core kernel adds support for PMD mappings, 1737 // the return value of this function will remain at PAGE_SIZE. 1738 return PAGE_SIZE; 1739 } 1740 1741 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index) 1742 { 1743 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1744 size_t big_page_size, big_page_index; 1745 1746 if (!gpu_state) 1747 return 0; 1748 1749 if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) 1750 return 0; 1751 1752 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id)); 1753 1754 if (gpu_state->pte_is_2m) 1755 return UVM_PAGE_SIZE_2M; 1756 1757 big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id)); 1758 big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size); 1759 if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes)) 1760 return big_page_size; 1761 1762 return UVM_PAGE_SIZE_4K; 1763 } 1764 1765 // Get the size of the physical allocation backing the page, or 0 if not 1766 // resident. Note that this is different from uvm_va_block_page_size_* because 1767 // those return the size of the PTE which maps the page index, which may be 1768 // smaller than the physical allocation. 
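// For example (illustrative): a page backed by a 2M GPU chunk may currently
// be mapped with only a big or 4K PTE. This function reports the 2M backing
// size, while uvm_va_block_page_size_gpu() reports the PTE size.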
1769 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page) 1770 { 1771 uvm_va_block_gpu_state_t *gpu_state; 1772 uvm_chunk_size_t chunk_size; 1773 1774 if (UVM_ID_IS_CPU(page.processor)) { 1775 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index); 1776 1777 if (!uvm_page_mask_test(&block->cpu.resident, page.page_index)) 1778 return 0; 1779 1780 UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU)); 1781 return (NvU32)uvm_cpu_chunk_get_size(chunk); 1782 } 1783 1784 gpu_state = uvm_va_block_gpu_state_get(block, page.processor); 1785 if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index)) 1786 return 0; 1787 1788 UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor)); 1789 block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size); 1790 return (NvU32)chunk_size; 1791 } 1792 1793 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot) 1794 { 1795 uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX; 1796 1797 // ATOMIC and WRITE are synonyms for the CPU 1798 if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE) 1799 pte_bit_index = UVM_PTE_BITS_CPU_WRITE; 1800 else if (prot == UVM_PROT_READ_ONLY) 1801 pte_bit_index = UVM_PTE_BITS_CPU_READ; 1802 else 1803 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); 1804 1805 return pte_bit_index; 1806 } 1807 1808 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot) 1809 { 1810 uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX; 1811 1812 if (prot == UVM_PROT_READ_WRITE_ATOMIC) 1813 pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC; 1814 else if (prot == UVM_PROT_READ_WRITE) 1815 pte_bit_index = UVM_PTE_BITS_GPU_WRITE; 1816 else if (prot == UVM_PROT_READ_ONLY) 1817 pte_bit_index = UVM_PTE_BITS_GPU_READ; 1818 else 1819 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); 1820 1821 return pte_bit_index; 1822 } 1823 1824 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) 1825 { 1826 uvm_va_block_gpu_state_t *gpu_state; 1827 1828 if (UVM_ID_IS_CPU(processor)) 1829 return &block->cpu.resident; 1830 1831 gpu_state = uvm_va_block_gpu_state_get(block, processor); 1832 1833 UVM_ASSERT(gpu_state); 1834 return &gpu_state->resident; 1835 } 1836 1837 // Get the page residency mask for a processor 1838 // 1839 // Notably this will allocate GPU state if not yet present and if that fails 1840 // NULL is returned. 
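// The CPU resident mask is embedded in the block itself, so the allocation
// can only fail for GPU processors.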
1841 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor) 1842 { 1843 uvm_va_block_gpu_state_t *gpu_state; 1844 1845 if (UVM_ID_IS_CPU(processor)) 1846 return &block->cpu.resident; 1847 1848 gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor)); 1849 if (!gpu_state) 1850 return NULL; 1851 1852 return &gpu_state->resident; 1853 } 1854 1855 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block, 1856 uvm_processor_id_t processor, 1857 uvm_prot_t prot) 1858 { 1859 uvm_va_block_gpu_state_t *gpu_state; 1860 1861 if (UVM_ID_IS_CPU(processor)) 1862 return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)]; 1863 1864 gpu_state = uvm_va_block_gpu_state_get(block, processor); 1865 1866 UVM_ASSERT(gpu_state); 1867 return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)]; 1868 } 1869 1870 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) 1871 { 1872 return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY); 1873 } 1874 1875 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) 1876 { 1877 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); 1878 UVM_ASSERT(gpu_state); 1879 1880 return &gpu_state->evicted; 1881 } 1882 1883 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index) 1884 { 1885 uvm_processor_id_t id; 1886 for_each_id_in_mask(id, &block->resident) { 1887 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index)) 1888 return true; 1889 } 1890 1891 return false; 1892 } 1893 1894 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) 1895 { 1896 uvm_va_block_gpu_state_t *gpu_state; 1897 size_t chunk_index; 1898 1899 if (UVM_ID_IS_CPU(proc)) 1900 return uvm_page_mask_test(&block->cpu.allocated, page_index); 1901 1902 gpu_state = uvm_va_block_gpu_state_get(block, proc); 1903 if (!gpu_state) 1904 return false; 1905 1906 chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL); 1907 return gpu_state->chunks[chunk_index] != NULL; 1908 } 1909 1910 static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) 1911 { 1912 const uvm_page_mask_t *resident_mask; 1913 1914 if (UVM_ID_IS_CPU(proc)) { 1915 resident_mask = &block->cpu.resident; 1916 } 1917 else { 1918 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc); 1919 if (!gpu_state) 1920 return false; 1921 1922 resident_mask = &gpu_state->resident; 1923 } 1924 1925 return uvm_page_mask_test(resident_mask, page_index); 1926 } 1927 1928 // Compute the gpus that have at least the given access permissions for the 1929 // range described by region and page_mask. The function sets the bit if any 1930 // page in the region has the permissions. 
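// In other words, this is an "any page" (OR) check across the region; see
// block_is_gpu_authorized_on_whole_region() below for the "all pages" (AND)
// variant.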
1931 static void block_region_authorized_gpus(uvm_va_block_t *va_block, 1932 uvm_va_block_region_t region, 1933 uvm_prot_t access_permission, 1934 uvm_processor_mask_t *authorized_gpus) 1935 { 1936 uvm_gpu_id_t gpu_id; 1937 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission); 1938 1939 uvm_processor_mask_zero(authorized_gpus); 1940 1941 // Test all GPUs with mappings on the block 1942 for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) { 1943 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1944 if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region)) 1945 uvm_processor_mask_set(authorized_gpus, gpu_id); 1946 } 1947 } 1948 1949 // Compute the processors that have at least the given access permissions for 1950 // the range described by region and page_mask. The function sets the bit if any 1951 // page in the region has the permissions. 1952 static void block_region_authorized_processors(uvm_va_block_t *va_block, 1953 uvm_va_block_region_t region, 1954 uvm_prot_t access_permission, 1955 uvm_processor_mask_t *authorized_processors) 1956 { 1957 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission); 1958 1959 // Compute GPUs 1960 block_region_authorized_gpus(va_block, region, access_permission, authorized_processors); 1961 1962 // Test CPU 1963 if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) && 1964 !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) { 1965 uvm_processor_mask_set(authorized_processors, UVM_ID_CPU); 1966 } 1967 } 1968 1969 static void block_page_authorized_processors(uvm_va_block_t *va_block, 1970 uvm_page_index_t page_index, 1971 uvm_prot_t access_permission, 1972 uvm_processor_mask_t *authorized_processors) 1973 { 1974 block_region_authorized_processors(va_block, 1975 uvm_va_block_region_for_page(page_index), 1976 access_permission, 1977 authorized_processors); 1978 } 1979 1980 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block, 1981 uvm_va_block_region_t region, 1982 uvm_gpu_id_t gpu_id, 1983 uvm_prot_t required_prot) 1984 { 1985 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot); 1986 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1987 1988 if (!gpu_state) 1989 return false; 1990 1991 return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region); 1992 } 1993 1994 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block, 1995 uvm_va_block_region_t region, 1996 uvm_processor_id_t processor_id, 1997 uvm_prot_t required_prot) 1998 { 1999 if (UVM_ID_IS_CPU(processor_id)) { 2000 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot); 2001 2002 return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region); 2003 } 2004 else { 2005 return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot); 2006 } 2007 } 2008 2009 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block, 2010 uvm_page_index_t page_index, 2011 uvm_gpu_id_t gpu_id, 2012 uvm_prot_t required_prot) 2013 { 2014 return block_is_gpu_authorized_on_whole_region(va_block, 2015 uvm_va_block_region_for_page(page_index), 2016 gpu_id, 2017 required_prot); 2018 } 2019 2020 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block, 2021 uvm_page_index_t page_index, 2022 uvm_processor_id_t processor_id, 2023 uvm_prot_t required_prot) 2024 { 2025 return 
block_is_processor_authorized_on_whole_region(va_block, 2026 uvm_va_block_region_for_page(page_index), 2027 processor_id, 2028 required_prot); 2029 } 2030 2031 // Compute the gpus that have a copy of the given page resident in their memory 2032 static void block_page_resident_gpus(uvm_va_block_t *va_block, 2033 uvm_page_index_t page_index, 2034 uvm_processor_mask_t *resident_gpus) 2035 { 2036 uvm_gpu_id_t id; 2037 uvm_processor_mask_zero(resident_gpus); 2038 2039 for_each_gpu_id_in_mask(id, &va_block->resident) { 2040 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) { 2041 UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index)); 2042 uvm_processor_mask_set(resident_gpus, id); 2043 } 2044 } 2045 } 2046 2047 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block, 2048 uvm_page_index_t page_index, 2049 uvm_processor_mask_t *resident_processors) 2050 { 2051 block_page_resident_gpus(va_block, page_index, resident_processors); 2052 2053 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) { 2054 UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index)); 2055 uvm_processor_mask_set(resident_processors, UVM_ID_CPU); 2056 } 2057 } 2058 2059 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index) 2060 { 2061 uvm_processor_mask_t resident_processors; 2062 uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); 2063 2064 return uvm_processor_mask_get_count(&resident_processors); 2065 } 2066 2067 static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block, 2068 uvm_page_index_t page_index, 2069 uvm_processor_id_t processor, 2070 const uvm_processor_mask_t *processor_mask) 2071 { 2072 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 2073 uvm_processor_mask_t search_mask; 2074 uvm_processor_id_t id; 2075 2076 if (processor_mask) 2077 uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident); 2078 else 2079 uvm_processor_mask_copy(&search_mask, &va_block->resident); 2080 2081 for_each_closest_id(id, &search_mask, processor, va_space) { 2082 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) 2083 return id; 2084 } 2085 2086 // HMM va_blocks don't know if a page is CPU resident until either 2087 // migrate_vma_setup() or hmm_range_fault() is called. If a page isn't 2088 // resident anywhere, assume it is CPU resident. 2089 if (uvm_va_block_is_hmm(va_block)) 2090 return UVM_ID_CPU; 2091 2092 return UVM_ID_INVALID; 2093 } 2094 2095 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block, 2096 uvm_page_index_t page_index, 2097 uvm_processor_id_t processor) 2098 { 2099 return block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL); 2100 } 2101 2102 // We don't track the specific aperture of each mapped page. Instead, we assume 2103 // that each virtual mapping from a given processor always targets the closest 2104 // processor on which that page is resident (with special rules for UVM-Lite). 2105 // 2106 // This function verifies that assumption: before a page becomes resident on a 2107 // new location, assert that no processor has a valid mapping to a farther 2108 // processor on that page. 
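// For example (illustrative): if the CPU maps a page whose only resident copy
// is on a GPU, making that page newly resident on the CPU would trip this
// check, since the existing CPU mapping would then target a farther processor
// than the closest resident copy. Callers are expected to have unmapped such
// processors first.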
2109 static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency) 2110 { 2111 uvm_processor_mask_t resident_procs, mapped_procs; 2112 uvm_processor_id_t mapped_id, closest_id; 2113 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 2114 2115 uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block)); 2116 2117 for_each_id_in_mask(mapped_id, &mapped_procs) { 2118 if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) 2119 continue; 2120 2121 uvm_va_block_page_resident_processors(block, page_index, &resident_procs); 2122 UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs)); 2123 UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency)); 2124 uvm_processor_mask_set(&resident_procs, new_residency); 2125 closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id); 2126 UVM_ASSERT(!uvm_id_equal(closest_id, new_residency)); 2127 } 2128 2129 return true; 2130 } 2131 2132 // Returns the processor to which page_index should be mapped on gpu 2133 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block, 2134 uvm_gpu_t *gpu, 2135 uvm_page_index_t page_index) 2136 { 2137 uvm_processor_id_t dest_id; 2138 2139 // UVM-Lite GPUs can only map pages on the preferred location 2140 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) 2141 return uvm_va_range_get_policy(block->va_range)->preferred_location; 2142 2143 // Otherwise we always map the closest resident processor 2144 dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id); 2145 UVM_ASSERT(UVM_ID_IS_VALID(dest_id)); 2146 return dest_id; 2147 } 2148 2149 // Returns the processor to which page_index should be mapped on mapping_id 2150 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block, 2151 uvm_processor_id_t mapping_id, 2152 uvm_page_index_t page_index) 2153 { 2154 2155 if (UVM_ID_IS_CPU(mapping_id)) 2156 return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id); 2157 2158 return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index); 2159 } 2160 2161 static void block_get_mapped_processors(uvm_va_block_t *block, 2162 uvm_processor_id_t resident_id, 2163 uvm_page_index_t page_index, 2164 uvm_processor_mask_t *mapped_procs) 2165 { 2166 uvm_processor_id_t mapped_id; 2167 2168 uvm_processor_mask_zero(mapped_procs); 2169 2170 for_each_id_in_mask(mapped_id, &block->mapped) { 2171 if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) { 2172 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); 2173 2174 if (uvm_id_equal(to_map_id, resident_id)) 2175 uvm_processor_mask_set(mapped_procs, mapped_id); 2176 } 2177 } 2178 } 2179 2180 // We use block_gpu_get_processor_to_map to find the destination processor of a 2181 // given GPU mapping. This function is called when the mapping is established to 2182 // sanity check that the destination of the mapping matches the query. 
2183 static bool block_check_mapping_residency_region(uvm_va_block_t *block, 2184 uvm_gpu_t *gpu, 2185 uvm_processor_id_t mapping_dest, 2186 uvm_va_block_region_t region, 2187 const uvm_page_mask_t *page_mask) 2188 { 2189 uvm_page_index_t page_index; 2190 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2191 NvU64 va = uvm_va_block_cpu_page_address(block, page_index); 2192 uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index); 2193 UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map), 2194 "VA 0x%llx on %s: mapping %s, supposed to map %s", 2195 va, 2196 uvm_gpu_name(gpu), 2197 block_processor_name(block, mapping_dest), 2198 block_processor_name(block, proc_to_map)); 2199 } 2200 return true; 2201 } 2202 2203 static bool block_check_mapping_residency(uvm_va_block_t *block, 2204 uvm_gpu_t *gpu, 2205 uvm_processor_id_t mapping_dest, 2206 const uvm_page_mask_t *page_mask) 2207 { 2208 return block_check_mapping_residency_region(block, 2209 gpu, 2210 mapping_dest, 2211 uvm_va_block_region_from_block(block), 2212 page_mask); 2213 } 2214 2215 // Check that there are no mappings targeting resident_id from any processor in 2216 // the block. 2217 static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id) 2218 { 2219 uvm_processor_id_t mapped_id; 2220 uvm_page_index_t page_index; 2221 2222 for_each_id_in_mask(mapped_id, &block->mapped) { 2223 const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id); 2224 2225 for_each_va_block_page_in_mask(page_index, map_mask, block) { 2226 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); 2227 UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id)); 2228 } 2229 } 2230 2231 return true; 2232 } 2233 2234 // Zero all pages of the newly-populated chunk which are not resident anywhere 2235 // else in the system, adding that work to the block's tracker. In all cases, 2236 // this function adds a dependency on passed in tracker to the block's tracker. 2237 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block, 2238 uvm_gpu_t *gpu, 2239 uvm_gpu_chunk_t *chunk, 2240 uvm_va_block_region_t chunk_region, 2241 uvm_tracker_t *tracker) 2242 { 2243 uvm_va_block_gpu_state_t *gpu_state; 2244 NV_STATUS status; 2245 uvm_gpu_address_t memset_addr_base, memset_addr; 2246 uvm_push_t push; 2247 uvm_gpu_id_t id; 2248 uvm_va_block_region_t subregion; 2249 uvm_page_mask_t *zero_mask; 2250 2251 UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk)); 2252 2253 if (chunk->is_zero) 2254 return NV_OK; 2255 2256 gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 2257 zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS); 2258 2259 if (!zero_mask) 2260 return NV_ERR_NO_MEMORY; 2261 2262 // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the 2263 // operation. 2264 // 2265 // We may over-zero the page with this approach. For example, we might be 2266 // populating a 2MB chunk because only a single page within that chunk needs 2267 // to be made resident. If we also zero non-resident pages outside of the 2268 // strict region, we could waste the effort if those pages are populated on 2269 // another processor later and migrated here. 2270 // 2271 // We zero all non-resident pages in the chunk anyway for two reasons: 2272 // 2273 // 1) Efficiency. It's better to do all zeros as pipelined transfers once 2274 // rather than scatter them around for each populate operation. 
2275 // 2276 // 2) Optimizing the common case of block_populate_gpu_chunk being called 2277 // for already-populated chunks. If we zero once at initial populate, we 2278 // can simply check whether the chunk is present in the array. Otherwise 2279 // we'd have to recompute the "is any page resident" mask every time. 2280 2281 // Roll up all pages in chunk_region which are resident somewhere 2282 uvm_page_mask_zero(zero_mask); 2283 for_each_id_in_mask(id, &block->resident) 2284 uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id)); 2285 2286 // If all pages in the chunk are resident somewhere, we don't need to clear 2287 // anything. Just make sure the chunk is tracked properly. 2288 if (uvm_page_mask_region_full(zero_mask, chunk_region)) { 2289 status = uvm_tracker_add_tracker_safe(&block->tracker, tracker); 2290 goto out; 2291 } 2292 2293 // Complement to get the pages which are not resident anywhere. These 2294 // are the pages which must be zeroed. 2295 uvm_page_mask_complement(zero_mask, zero_mask); 2296 2297 memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address)); 2298 memset_addr = memset_addr_base; 2299 2300 status = uvm_push_begin_acquire(gpu->channel_manager, 2301 UVM_CHANNEL_TYPE_GPU_INTERNAL, 2302 tracker, 2303 &push, 2304 "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)", 2305 chunk->address, 2306 chunk->address + uvm_gpu_chunk_get_size(chunk), 2307 uvm_va_block_region_start(block, chunk_region), 2308 uvm_va_block_region_end(block, chunk_region) + 1, 2309 block->start, 2310 block->end + 1); 2311 if (status != NV_OK) 2312 goto out; 2313 2314 for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) { 2315 // Pipeline the memsets since they never overlap with each other 2316 uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); 2317 2318 // We'll push one membar later for all memsets in this loop 2319 uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); 2320 2321 memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE; 2322 gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion)); 2323 } 2324 2325 // A membar from this GPU is required between this memset and any PTE write 2326 // pointing this or another GPU to this chunk. Otherwise an engine could 2327 // read the PTE then access the page before the memset write is visible to 2328 // that engine. 2329 // 2330 // This memset writes GPU memory, so local mappings need only a GPU-local 2331 // membar. We can't easily determine here whether a peer GPU will ever map 2332 // this page in the future, so always use a sysmembar. uvm_push_end provides 2333 // one by default. 2334 // 2335 // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map 2336 // this page. When peer access gets enabled, do a MEMBAR_SYS at that 2337 // point. 
2338 uvm_push_end(&push); 2339 status = uvm_tracker_add_push_safe(&block->tracker, &push); 2340 2341 out: 2342 if (zero_mask) 2343 kmem_cache_free(g_uvm_page_mask_cache, zero_mask); 2344 2345 return status; 2346 } 2347 2348 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block, 2349 uvm_va_block_retry_t *retry, 2350 uvm_gpu_t *gpu, 2351 size_t chunk_index, 2352 uvm_va_block_region_t chunk_region) 2353 { 2354 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); 2355 uvm_gpu_chunk_t *chunk = NULL; 2356 uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region); 2357 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); 2358 NV_STATUS status; 2359 2360 if (!gpu_state) 2361 return NV_ERR_NO_MEMORY; 2362 2363 uvm_assert_mutex_locked(&block->lock); 2364 UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu)); 2365 UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes); 2366 2367 // We zero chunks as necessary at initial population, so if the chunk is 2368 // already populated we're done. See the comment in 2369 // block_zero_new_gpu_chunk. 2370 if (gpu_state->chunks[chunk_index]) 2371 return NV_OK; 2372 2373 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region)); 2374 2375 status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk); 2376 if (status != NV_OK) 2377 return status; 2378 2379 // In some configurations such as SR-IOV heavy, the chunk cannot be 2380 // referenced using its physical address. Create a virtual mapping. 2381 status = uvm_mmu_chunk_map(chunk); 2382 if (status != NV_OK) 2383 goto chunk_free; 2384 2385 status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker); 2386 if (status != NV_OK) 2387 goto chunk_unmap; 2388 2389 // It is safe to modify the page index field without holding any PMM locks 2390 // because the chunk is pinned, which means that none of the other fields in 2391 // the bitmap can change. 2392 chunk->va_block_page_index = chunk_region.first; 2393 2394 // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at 2395 // compile-time that it can store VA Block page indexes. 2396 BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE); 2397 2398 status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk); 2399 if (status != NV_OK) 2400 goto chunk_unmap; 2401 2402 if (block_test && block_test->inject_populate_error) { 2403 block_test->inject_populate_error = false; 2404 2405 // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than 2406 // causing a fatal OOM failure. 2407 status = NV_ERR_MORE_PROCESSING_REQUIRED; 2408 goto chunk_unmap_indirect_peers; 2409 } 2410 2411 // Record the used chunk so that it can be unpinned at the end of the whole 2412 // operation. 2413 block_retry_add_used_chunk(retry, chunk); 2414 gpu_state->chunks[chunk_index] = chunk; 2415 2416 return NV_OK; 2417 2418 chunk_unmap_indirect_peers: 2419 block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk); 2420 2421 chunk_unmap: 2422 uvm_mmu_chunk_unmap(chunk, &block->tracker); 2423 2424 chunk_free: 2425 // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it 2426 // placed in the block tracker. 2427 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); 2428 2429 return status; 2430 } 2431 2432 // Populate all chunks which cover the given region and page mask. 
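// Chunks are walked in address order and a chunk is populated only if at
// least one page of the populate mask falls within it. Chunk sizes may vary
// from one chunk to the next.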
2433 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block, 2434 uvm_va_block_retry_t *retry, 2435 uvm_gpu_t *gpu, 2436 uvm_va_block_region_t region, 2437 const uvm_page_mask_t *populate_mask) 2438 { 2439 uvm_va_block_region_t chunk_region, check_region; 2440 size_t chunk_index; 2441 uvm_page_index_t page_index; 2442 uvm_chunk_size_t chunk_size; 2443 NV_STATUS status; 2444 2445 page_index = uvm_va_block_first_page_in_mask(region, populate_mask); 2446 if (page_index == region.outer) 2447 return NV_OK; 2448 2449 chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); 2450 chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index); 2451 2452 while (1) { 2453 check_region = uvm_va_block_region(max(chunk_region.first, region.first), 2454 min(chunk_region.outer, region.outer)); 2455 page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask); 2456 if (page_index != check_region.outer) { 2457 status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region); 2458 if (status != NV_OK) 2459 return status; 2460 } 2461 2462 if (check_region.outer == region.outer) 2463 break; 2464 2465 ++chunk_index; 2466 chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer); 2467 chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE)); 2468 } 2469 2470 return NV_OK; 2471 } 2472 2473 static NV_STATUS block_populate_pages(uvm_va_block_t *block, 2474 uvm_va_block_retry_t *retry, 2475 uvm_va_block_context_t *block_context, 2476 uvm_processor_id_t dest_id, 2477 uvm_va_block_region_t region, 2478 const uvm_page_mask_t *page_mask) 2479 { 2480 NV_STATUS status; 2481 const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id); 2482 uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask; 2483 uvm_memcg_context_t memcg_context; 2484 2485 if (!resident_mask) 2486 return NV_ERR_NO_MEMORY; 2487 2488 if (page_mask) 2489 uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask); 2490 else 2491 uvm_page_mask_complement(populate_page_mask, resident_mask); 2492 2493 if (UVM_ID_IS_GPU(dest_id)) 2494 return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask); 2495 2496 uvm_memcg_context_start(&memcg_context, block_context->mm); 2497 status = block_populate_pages_cpu(block, populate_page_mask, region, block_context); 2498 uvm_memcg_context_end(&memcg_context); 2499 return status; 2500 } 2501 2502 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from) 2503 { 2504 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 2505 2506 return &va_space->can_copy_from[uvm_id_value(from)]; 2507 } 2508 2509 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to) 2510 { 2511 return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from); 2512 } 2513 2514 // Get the chunk containing the given page, along with the offset of that page 2515 // within the chunk. 
2516 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
2517 {
2518 uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
2519 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
2520 size_t chunk_index;
2521 uvm_gpu_chunk_t *chunk;
2522 uvm_chunk_size_t chunk_size;
2523
2524 UVM_ASSERT(gpu_state);
2525
2526 chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
2527 chunk = gpu_state->chunks[chunk_index];
2528 UVM_ASSERT(chunk);
2529
2530 if (chunk_offset) {
2531 size_t page_offset = block_page.page_index -
2532 uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
2533 *chunk_offset = page_offset * PAGE_SIZE;
2534 }
2535
2536 return chunk;
2537 }
2538
2539 // Get the physical GPU address of a block's page from the POV of the specified GPU.
2540 // This is the address that should be used for making PTEs for the specified GPU.
2541 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
2542 block_phys_page_t block_page,
2543 uvm_gpu_t *gpu)
2544 {
2545 uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2546 size_t chunk_offset;
2547 uvm_gpu_chunk_t *chunk;
2548
2549 UVM_ASSERT(accessing_gpu_state);
2550
2551 if (UVM_ID_IS_CPU(block_page.processor)) {
2552 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
2553 NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
2554 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
2555 uvm_cpu_chunk_get_size(chunk),
2556 block_page.page_index);
2557
2558 // The page should be mapped for physical access already as we do that
2559 // eagerly on CPU page population and GPU state alloc.
2560 UVM_ASSERT(dma_addr != 0);
2561 dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
2562
2563 return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
2564 }
2565
2566 chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2567
2568 if (uvm_id_equal(block_page.processor, gpu->id)) {
2569 return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
2570 }
2571 else {
2572 uvm_gpu_phys_address_t phys_addr;
2573 uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
2574 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2575
2576 UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2577 phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
2578 phys_addr.address += chunk_offset;
2579 return phys_addr;
2580 }
2581 }
2582
2583 // Get the physical GPU address of a block's page from the POV of the specified
2584 // GPU, suitable for accessing the memory from UVM-internal CE channels.
2585 //
2586 // Notably this may be different from block_phys_page_address() to handle CE
2587 // limitations in addressing physical memory directly.
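// For sysmem and the GPU's own vidmem, the physical address from
// block_phys_page_address() is converted with uvm_gpu_address_copy(). Peer
// memory is accessed through a dedicated peer copy address instead.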
2588 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block, 2589 block_phys_page_t block_page, 2590 uvm_gpu_t *gpu) 2591 { 2592 uvm_gpu_t *owning_gpu; 2593 size_t chunk_offset; 2594 uvm_gpu_chunk_t *chunk; 2595 uvm_gpu_address_t copy_addr; 2596 uvm_va_space_t *va_space; 2597 2598 UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor), 2599 "from %s to %s\n", 2600 block_processor_name(block, gpu->id), 2601 block_processor_name(block, block_page.processor)); 2602 2603 // CPU and local GPU accesses can rely on block_phys_page_address, but the 2604 // resulting physical address may need to be converted into virtual. 2605 if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id)) 2606 return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu)); 2607 2608 va_space = uvm_va_block_get_va_space(block); 2609 2610 // See the comments on the peer_identity_mappings_supported assignments in 2611 // the HAL for why we disable direct copies between peers. 2612 owning_gpu = block_get_gpu(block, block_page.processor); 2613 2614 UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu)); 2615 2616 chunk = block_phys_page_chunk(block, block_page, &chunk_offset); 2617 copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu); 2618 copy_addr.address += chunk_offset; 2619 return copy_addr; 2620 } 2621 2622 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block, 2623 uvm_page_index_t page_index, 2624 uvm_processor_id_t residency, 2625 uvm_gpu_t *gpu) 2626 { 2627 uvm_assert_mutex_locked(&va_block->lock); 2628 2629 return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu); 2630 } 2631 2632 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, 2633 uvm_page_index_t page_index, 2634 uvm_gpu_t *gpu) 2635 { 2636 return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu); 2637 } 2638 2639 typedef struct 2640 { 2641 // Location of the memory 2642 uvm_processor_id_t id; 2643 2644 // Whether the whole block has a single physically-contiguous chunk of 2645 // storage on the processor. 2646 bool is_block_contig; 2647 2648 // Starting address of the physically-contiguous allocation, from the view 2649 // of the copying GPU. Valid only if is_block_contig. 2650 uvm_gpu_address_t gpu_address; 2651 } block_copy_addr_t; 2652 2653 typedef struct 2654 { 2655 block_copy_addr_t src; 2656 block_copy_addr_t dst; 2657 uvm_conf_computing_dma_buffer_t *dma_buffer; 2658 } block_copy_state_t; 2659 2660 // Begin a push appropriate for copying data from src_id processor to dst_id processor. 2661 // One of src_id and dst_id needs to be a GPU. 
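// The GPU owning the push is the destination for CPU-to-GPU copies and the
// source for GPU-to-CPU and GPU-to-GPU copies (pushing from the source works
// better, at least for P2P over PCIe).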
2662 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block, 2663 block_copy_state_t *copy_state, 2664 uvm_tracker_t *tracker, 2665 uvm_push_t *push) 2666 { 2667 uvm_gpu_t *gpu; 2668 NV_STATUS status; 2669 uvm_channel_type_t channel_type; 2670 uvm_tracker_t *tracker_ptr = tracker; 2671 uvm_processor_id_t dst_id = copy_state->dst.id; 2672 uvm_processor_id_t src_id = copy_state->src.id; 2673 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 2674 2675 UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id), 2676 "Unexpected copy to self, processor %s\n", 2677 block_processor_name(va_block, src_id)); 2678 2679 if (UVM_ID_IS_CPU(src_id)) { 2680 gpu = block_get_gpu(va_block, dst_id); 2681 channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU; 2682 } 2683 else if (UVM_ID_IS_CPU(dst_id)) { 2684 gpu = block_get_gpu(va_block, src_id); 2685 channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU; 2686 } 2687 else { 2688 // For GPU to GPU copies, prefer to "push" the data from the source as 2689 // that works better at least for P2P over PCI-E. 2690 gpu = block_get_gpu(va_block, src_id); 2691 2692 channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU; 2693 } 2694 2695 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id), 2696 "GPU %s dst %s src %s\n", 2697 block_processor_name(va_block, gpu->id), 2698 block_processor_name(va_block, dst_id), 2699 block_processor_name(va_block, src_id)); 2700 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id), 2701 "GPU %s dst %s src %s\n", 2702 block_processor_name(va_block, gpu->id), 2703 block_processor_name(va_block, dst_id), 2704 block_processor_name(va_block, src_id)); 2705 2706 if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) { 2707 uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id); 2708 return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager, 2709 dst_gpu, 2710 tracker, 2711 push, 2712 "Copy from %s to %s for block [0x%llx, 0x%llx]", 2713 block_processor_name(va_block, src_id), 2714 block_processor_name(va_block, dst_id), 2715 va_block->start, 2716 va_block->end); 2717 } 2718 2719 if (uvm_conf_computing_mode_enabled(gpu)) { 2720 // When the Confidential Feature is enabled, additional dependencies 2721 // apply to the input tracker as well as the dma_buffer tracker. 2722 // * In the CPU to GPU case, because UVM performs CPU side 2723 // crypto-operations first before the GPU copy, we both need to 2724 // ensure that the dma_buffer and the input tracker are completed. 2725 // * In the GPU to CPU case, the GPU copy happens first, but the same 2726 // principles apply. Hence, UVM acquires the input tracker and the 2727 // dma buffer. 
2728 status = uvm_tracker_overwrite_safe(&local_tracker, tracker);
2729 if (status != NV_OK)
2730 goto error;
2731
2732 UVM_ASSERT(copy_state->dma_buffer == NULL);
2733 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
2734 &copy_state->dma_buffer,
2735 &local_tracker);
2736
2737 if (status != NV_OK)
2738 goto error;
2739
2740 if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) {
2741 status = uvm_tracker_wait(&local_tracker);
2742 if (status != NV_OK)
2743 goto error;
2744 }
2745
2746 tracker_ptr = &local_tracker;
2747 }
2748
2749 status = uvm_push_begin_acquire(gpu->channel_manager,
2750 channel_type,
2751 tracker_ptr,
2752 push,
2753 "Copy from %s to %s for block [0x%llx, 0x%llx]",
2754 block_processor_name(va_block, src_id),
2755 block_processor_name(va_block, dst_id),
2756 va_block->start,
2757 va_block->end);
2758
2759 error:
2760 // Caller is responsible for freeing the DMA buffer on error
2761 uvm_tracker_deinit(&local_tracker);
2762 return status;
2763 }
2764
2765 // A page is clean iff...
2766 // the destination is the preferred location and
2767 // the source is the CPU and
2768 // the destination does not support faults/eviction and
2769 // the CPU page is not dirty
2770 static bool block_page_is_clean(uvm_va_block_t *block,
2771 uvm_processor_id_t dst_id,
2772 uvm_processor_id_t src_id,
2773 uvm_page_index_t page_index)
2774 {
2775 return !uvm_va_block_is_hmm(block) &&
2776 uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) &&
2777 UVM_ID_IS_CPU(src_id) &&
2778 !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
2779 !block_cpu_page_is_dirty(block, page_index);
2780 }
2781
2782 // When the destination is the CPU...
2783 // if the source is the preferred location, mark as clean
2784 // otherwise, mark as dirty
2785 static void block_update_page_dirty_state(uvm_va_block_t *block,
2786 uvm_processor_id_t dst_id,
2787 uvm_processor_id_t src_id,
2788 uvm_page_index_t page_index)
2789 {
2790 if (UVM_ID_IS_GPU(dst_id))
2791 return;
2792
2793 if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location))
2794 block_mark_cpu_page_clean(block, page_index);
2795 else
2796 block_mark_cpu_page_dirty(block, page_index);
2797 }
2798
2799 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
2800 {
2801 uvm_gpu_t *gpu;
2802
2803 if (UVM_ID_IS_CPU(id))
2804 return;
2805
2806 gpu = block_get_gpu(block, id);
2807
2808 // If the block is of the max size and the GPU supports eviction, mark the
2809 // root chunk as used in PMM.
2810 // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks.
2811 if (!uvm_va_block_is_hmm(block) && 2812 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && 2813 uvm_gpu_supports_eviction(gpu)) { 2814 // The chunk has to be there if this GPU is resident 2815 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id)); 2816 uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]); 2817 } 2818 } 2819 2820 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) 2821 { 2822 UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); 2823 2824 if (uvm_processor_mask_test_and_set(&block->resident, id)) 2825 return; 2826 2827 block_mark_memory_used(block, id); 2828 } 2829 2830 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) 2831 { 2832 uvm_gpu_t *gpu; 2833 2834 UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); 2835 2836 if (!uvm_processor_mask_test_and_clear(&block->resident, id)) 2837 return; 2838 2839 if (UVM_ID_IS_CPU(id)) 2840 return; 2841 2842 gpu = block_get_gpu(block, id); 2843 2844 // If the block is of the max size and the GPU supports eviction, mark the 2845 // root chunk as unused in PMM. 2846 if (!uvm_va_block_is_hmm(block) && 2847 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && 2848 uvm_gpu_supports_eviction(gpu)) { 2849 // The chunk may not be there any more when residency is cleared. 2850 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 2851 if (gpu_state && gpu_state->chunks[0]) 2852 uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]); 2853 } 2854 } 2855 2856 static bool block_phys_copy_contig_check(uvm_va_block_t *block, 2857 uvm_page_index_t page_index, 2858 const uvm_gpu_address_t *base_address, 2859 uvm_processor_id_t proc_id, 2860 uvm_gpu_t *copying_gpu) 2861 { 2862 uvm_gpu_address_t page_address; 2863 uvm_gpu_address_t contig_address = *base_address; 2864 2865 contig_address.address += page_index * PAGE_SIZE; 2866 2867 page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu); 2868 2869 return uvm_gpu_addr_cmp(page_address, contig_address) == 0; 2870 } 2871 2872 // Check if the VA block has a single physically-contiguous chunk of storage 2873 // on the processor. 2874 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id) 2875 { 2876 uvm_cpu_chunk_t *chunk; 2877 2878 if (UVM_ID_IS_GPU(id)) 2879 return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0); 2880 2881 chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL); 2882 return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk)); 2883 } 2884 2885 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block, 2886 uvm_page_index_t page_index, 2887 uvm_processor_id_t resident_id) 2888 { 2889 if (UVM_ID_IS_CPU(resident_id)) { 2890 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 2891 return uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk)); 2892 } 2893 else { 2894 uvm_chunk_size_t chunk_size; 2895 (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size); 2896 return uvm_va_block_chunk_region(block, chunk_size, page_index); 2897 } 2898 } 2899 2900 // Like block_phys_page_copy_address, but uses the address cached in bca when 2901 // possible. 
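// When the block's storage on bca->id is a single physically-contiguous
// allocation, the per-page address is computed by offsetting the cached base
// address rather than looking up the chunk again.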
2902 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
2903 block_copy_addr_t *bca,
2904 uvm_page_index_t page_index,
2905 uvm_gpu_t *copying_gpu)
2906 {
2907 if (bca->is_block_contig) {
2908 uvm_gpu_address_t addr = bca->gpu_address;
2909 addr.address += page_index * PAGE_SIZE;
2910 UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu));
2911 return addr;
2912 }
2913
2914 return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu);
2915 }
2916
2917 // When the Confidential Computing feature is enabled, the function performs
2918 // CPU side page encryption and GPU side decryption to the CPR.
2919 // GPU operations respect the caller's membar previously set in the push.
2920 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
2921 block_copy_state_t *copy_state,
2922 uvm_va_block_region_t region,
2923 uvm_push_t *push)
2924 {
2925 uvm_push_flag_t membar_flag = 0;
2926 uvm_gpu_t *gpu = uvm_push_get_gpu(push);
2927 uvm_page_index_t page_index = region.first;
2928 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
2929 struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
2930 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
2931 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
2932 char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
2933 (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
2934 uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
2935 char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
2936
2937 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
2938 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
2939
2940 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
2941
2942 // See comment in block_copy_begin_push.
2943 UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
2944
2945 staging_buffer.address += page_index * PAGE_SIZE;
2946 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2947
2948 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
2949 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
2950 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
2951 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
2952
2953 // kmap() only guarantees PAGE_SIZE contiguity, all encryption and
2954 // decryption must happen on a PAGE_SIZE basis.
2955 for_each_va_block_page_in_region(page_index, region) {
2956 void *src_cpu_virt_addr;
2957
2958 // The caller guarantees that all pages in region are contiguous,
2959 // meaning they're guaranteed to be part of the same compound page.
2960 UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index));
2961
2962 src_cpu_virt_addr = kmap(src_page);
2963 uvm_conf_computing_cpu_encrypt(push->channel,
2964 cpu_va_staging_buffer,
2965 src_cpu_virt_addr,
2966 NULL,
2967 PAGE_SIZE,
2968 cpu_auth_tag_buffer);
2969 kunmap(src_page);
2970
2971 // The first LCE operation should be non-pipelined to guarantee ordering,
2972 // as we do not know when the last non-pipelined copy was.
2973 // The last one applies the membar originally planned for the push, if any.
2974 // TODO: 3857691: Inherit policy instead of forcing first invocation to
2975 // be non-pipelined.
2976 if (page_index > region.first)
2977 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2978
2979 if (page_index < (region.outer - 1))
2980 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2981 else if (membar_flag)
2982 uvm_push_set_flag(push, membar_flag);
2983
2984 gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
2985
2986 src_page++;
2987 dst_address.address += PAGE_SIZE;
2988 cpu_va_staging_buffer += PAGE_SIZE;
2989 staging_buffer.address += PAGE_SIZE;
2990 cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2991 auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2992 }
2993 }
2994
2995 // When the Confidential Computing feature is enabled, the function performs
2996 // GPU side page encryption. GPU operations respect the caller's membar
2997 // previously set in the push.
2998 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
2999 block_copy_state_t *copy_state,
3000 uvm_va_block_region_t region,
3001 uvm_push_t *push)
3002 {
3003 uvm_push_flag_t membar_flag = 0;
3004 uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3005 uvm_page_index_t page_index = region.first;
3006 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3007 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3008 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3009 uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
3010
3011 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3012 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
3013
3014 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
3015
3016 staging_buffer.address += page_index * PAGE_SIZE;
3017 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3018
3019 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3020 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3021 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3022 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3023
3024 // Because we use kmap() for mapping pages for CPU side
3025 // crypto-operations and it only guarantees PAGE_SIZE contiguity, all
3026 // encryptions and decryptions must happen on a PAGE_SIZE basis.
3027 for_each_va_block_page_in_region(page_index, region) {
3028 uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
3029
3030 // The first LCE operation should be non-pipelined to guarantee ordering,
3031 // as we do not know when the last non-pipelined copy was.
3032 // The last one applies the membar originally planned for the push, if any.
3033 // TODO: 3857691: Inherit policy instead of forcing first invocation to
3034 // be non-pipelined.
3035 if (page_index > region.first)
3036 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3037
3038 if (page_index < (region.outer - 1))
3039 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3040 else if (membar_flag)
3041 uvm_push_set_flag(push, membar_flag);
3042
3043 gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3044
3045 src_address.address += PAGE_SIZE;
3046 staging_buffer.address += PAGE_SIZE;
3047 auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3048 }
3049
3050 uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3051 }
3052
3053 static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3054 block_copy_state_t *copy_state,
3055 uvm_push_t *push)
3056 {
3057 NV_STATUS status;
3058 uvm_page_index_t page_index;
3059 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3060 uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3061 void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3062 void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3063
3064 UVM_ASSERT(uvm_channel_is_secure(push->channel));
3065
3066 if (UVM_ID_IS_GPU(copy_state->dst.id))
3067 return NV_OK;
3068
3069 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3070
3071 status = uvm_push_wait(push);
3072 if (status != NV_OK)
3073 return status;
3074
3075 // kmap() only guarantees PAGE_SIZE contiguity, all encryption and
3076 // decryption must happen on a PAGE_SIZE basis.
3077 for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3078 struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
3079 void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3080 void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3081 void *cpu_page_address = kmap(dst_page);
3082
3083 status = uvm_conf_computing_cpu_decrypt(push->channel,
3084 cpu_page_address,
3085 staging_buffer,
3086 &dma_buffer->decrypt_iv[page_index],
3087 PAGE_SIZE,
3088 auth_tag_buffer);
3089 kunmap(dst_page);
3090 if (status != NV_OK) {
3091 // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
3092 // failures & other failures gracefully.
3093 // uvm_conf_computing_cpu_decrypt() can fail if the authentication
3094 // tag verification fails. Should this happen, it is considered a
3095 // critical failure that cannot be recovered from.
3096             uvm_global_set_fatal_error(status);
3097             return status;
3098         }
3099     }
3100 
3101     return NV_OK;
3102 }
3103 
3104 static void block_copy_push(uvm_va_block_t *block,
3105                             block_copy_state_t *copy_state,
3106                             uvm_va_block_region_t region,
3107                             uvm_push_t *push)
3108 {
3109     uvm_gpu_address_t gpu_dst_address;
3110     uvm_gpu_address_t gpu_src_address;
3111     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3112 
3113     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3114 
3115     if (uvm_channel_is_secure(push->channel)) {
3116         if (UVM_ID_IS_CPU(copy_state->src.id))
3117             conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3118         else
3119             conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3120 
3121         return;
3122     }
3123 
3124     gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3125     gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3126     gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3127 }
3128 
3129 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3130                                      block_copy_state_t *copy_state,
3131                                      uvm_tracker_t *copy_tracker,
3132                                      NV_STATUS push_status,
3133                                      uvm_push_t *push)
3134 {
3135     NV_STATUS tracker_status;
3136 
3137     // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3138     //       by that GPU, use a GPU-local membar if no peer can currently
3139     //       map this page. When peer access gets enabled, do a MEMBAR_SYS
3140     //       at that point.
3141     uvm_push_end(push);
3142 
3143     if ((push_status == NV_OK) && uvm_channel_is_secure(push->channel))
3144         push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3145 
3146     tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
3147     if (push_status == NV_OK)
3148         push_status = tracker_status;
3149 
3150     if (uvm_channel_is_secure(push->channel)) {
3151         uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3152         uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3153 
3154         uvm_tracker_overwrite_with_push(&local_tracker, push);
3155         uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool,
3156                                            copy_state->dma_buffer,
3157                                            &local_tracker);
3158         copy_state->dma_buffer = NULL;
3159         uvm_tracker_deinit(&local_tracker);
3160     }
3161 
3162     return push_status;
3163 }
3164 
3165 // Copies pages resident on the src_id processor to the dst_id processor
3166 //
3167 // The function adds the pages that were successfully copied to the output
3168 // migrated_pages mask and returns the number of pages in copied_pages. These
3169 // fields are reliable even if an error is returned.
3170 //
3171 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
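//
// Note that copy_mask is consumed by this function: pages already resident on
// dst_id and pages already set in migrated_pages are cleared from it up front,
// and on failure it is further trimmed to the pages processed before the error.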
3172 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block, 3173 uvm_va_block_context_t *block_context, 3174 uvm_processor_id_t dst_id, 3175 uvm_processor_id_t src_id, 3176 uvm_va_block_region_t region, 3177 uvm_page_mask_t *copy_mask, 3178 const uvm_page_mask_t *prefetch_page_mask, 3179 uvm_va_block_transfer_mode_t transfer_mode, 3180 uvm_page_mask_t *migrated_pages, 3181 NvU32 *copied_pages, 3182 uvm_tracker_t *copy_tracker) 3183 { 3184 NV_STATUS status = NV_OK; 3185 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3186 uvm_gpu_t *copying_gpu = NULL; 3187 uvm_push_t push; 3188 uvm_page_index_t page_index; 3189 uvm_page_index_t contig_start_index = region.outer; 3190 uvm_page_index_t last_index = region.outer; 3191 uvm_range_group_range_t *rgr = NULL; 3192 bool rgr_has_changed = false; 3193 uvm_make_resident_cause_t cause = block_context->make_resident.cause; 3194 uvm_make_resident_cause_t contig_cause = cause; 3195 const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3196 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3197 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask; 3198 block_copy_state_t copy_state = {0}; 3199 uvm_va_range_t *va_range = block->va_range; 3200 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 3201 3202 copy_state.src.id = src_id; 3203 copy_state.dst.id = dst_id; 3204 copy_state.src.is_block_contig = is_block_phys_contig(block, src_id); 3205 copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id); 3206 3207 *copied_pages = 0; 3208 3209 // If there are no pages to be copied, exit early 3210 if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) || 3211 !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages)) 3212 return NV_OK; 3213 3214 // uvm_range_group_range_iter_first should only be called when the va_space 3215 // lock is held, which is always the case unless an eviction is taking 3216 // place. 3217 if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) { 3218 rgr = uvm_range_group_range_iter_first(va_space, 3219 uvm_va_block_region_start(block, region), 3220 uvm_va_block_region_end(block, region)); 3221 rgr_has_changed = true; 3222 } 3223 3224 if (UVM_ID_IS_CPU(dst_id)) { 3225 uvm_memcg_context_t memcg_context; 3226 3227 // To support staging through CPU, populate CPU pages on demand. 3228 // GPU destinations should have their pages populated already, but 3229 // that might change if we add staging through GPUs. 3230 uvm_memcg_context_start(&memcg_context, block_context->mm); 3231 status = block_populate_pages_cpu(block, copy_mask, region, block_context); 3232 uvm_memcg_context_end(&memcg_context); 3233 if (status != NV_OK) 3234 return status; 3235 } 3236 3237 // TODO: Bug 3745051: This function is complicated and needs refactoring 3238 for_each_va_block_page_in_region_mask(page_index, copy_mask, region) { 3239 NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index); 3240 uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ? 3241 UVM_MAKE_RESIDENT_CAUSE_PREFETCH: 3242 cause; 3243 3244 UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); 3245 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); 3246 3247 // If we're not evicting and we're migrating away from the preferred 3248 // location, then we should add the range group range to the list of 3249 // migrated ranges in the range group. 
It's safe to skip this because
3250         // the use of range_group's migrated_ranges list is a UVM-Lite
3251         // optimization - eviction is not supported on UVM-Lite GPUs.
3252         if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
3253             uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
3254             // rgr_has_changed is used to minimize the number of times the
3255             // migrated_ranges_lock is taken. It is set to false when the range
3256             // group range pointed to by rgr is added to the migrated_ranges
3257             // list, and it is set back to true when we move to a different
3258             // range group range.
3259 
3260             // The current page could be after the end of rgr. Iterate over the
3261             // range group ranges until rgr's end location is greater than or
3262             // equal to the current page.
3263             while (rgr && rgr->node.end < page_start) {
3264                 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
3265                 rgr_has_changed = true;
3266             }
3267 
3268             // Check whether the current page lies within rgr. A single page
3269             // must entirely reside within a range group range. Since we've
3270             // advanced rgr until its end is greater than or equal to
3271             // page_start, we now check if page_start lies within rgr.
3272             if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
3273                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
3274                 if (list_empty(&rgr->range_group_migrated_list_node))
3275                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
3276                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
3277 
3278                 rgr_has_changed = false;
3279             }
3280         }
3281 
3282         // No need to copy pages that haven't changed. Just clear residency
3283         // information.
3284         if (block_page_is_clean(block, dst_id, src_id, page_index))
3285             continue;
3286 
3287         if (!copying_gpu) {
3288             status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
3289 
3290             if (status != NV_OK)
3291                 break;
3292             copying_gpu = uvm_push_get_gpu(&push);
3293 
3294             // Record all processors involved in the copy
3295             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
3296             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
3297             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
3298 
3299             // This function is called just once per VA block and needs to
3300             // receive the "main" cause for the migration (it mainly checks if
3301             // we are in the eviction path). Therefore, we pass cause instead
3302             // of contig_cause.
3303             uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
3304         }
3305         else {
3306             uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3307         }
3308 
3309         if (!uvm_va_block_is_hmm(block))
3310             block_update_page_dirty_state(block, dst_id, src_id, page_index);
3311 
3312         if (last_index == region.outer) {
3313             bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
3314             bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
3315             contig_start_index = page_index;
3316             contig_cause = page_cause;
3317 
3318             // When CC is enabled, transfers between GPU and CPU don't rely on
3319             // any GPU mapping of CPU chunks, physical or virtual.
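            // In that case, don't cache a GPU address for the CPU side of the
            // copy; the secure-channel path in block_copy_push() stages the
            // data through the per-push DMA buffers instead.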
3320             if (UVM_ID_IS_CPU(src_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3321                 can_cache_src_phys_addr = false;
3322             if (UVM_ID_IS_CPU(dst_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3323                 can_cache_dst_phys_addr = false;
3324             // Computing the physical address is a non-trivial operation and
3325             // seems to be a performance limiter on systems with 2 or more
3326             // NVLINK links. Therefore, for physically-contiguous block
3327             // storage, we cache the start address and compute the page address
3328             // using the page index.
3329             if (can_cache_src_phys_addr) {
3330                 copy_state.src.gpu_address = block_phys_page_copy_address(block,
3331                                                                           block_phys_page(src_id, 0),
3332                                                                           copying_gpu);
3333             }
3334             if (can_cache_dst_phys_addr) {
3335                 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
3336                                                                           block_phys_page(dst_id, 0),
3337                                                                           copying_gpu);
3338             }
3339         }
3340         else if ((page_index != last_index + 1) || contig_cause != page_cause) {
3341             uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3342             UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3343 
3344             // If both src and dst are physically-contiguous, consolidate copies
3345             // of contiguous pages into a single method.
3346             if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3347                 block_copy_push(block, &copy_state, contig_region, &push);
3348 
3349             uvm_perf_event_notify_migration(&va_space->perf_events,
3350                                             &push,
3351                                             block,
3352                                             dst_id,
3353                                             src_id,
3354                                             uvm_va_block_region_start(block, contig_region),
3355                                             uvm_va_block_region_size(contig_region),
3356                                             transfer_mode,
3357                                             contig_cause,
3358                                             &block_context->make_resident);
3359 
3360             contig_start_index = page_index;
3361             contig_cause = page_cause;
3362         }
3363 
3364         if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig)
3365             block_copy_push(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
3366 
3367         last_index = page_index;
3368     }
3369 
3370     // Copy the remaining pages
3371     if (copying_gpu) {
3372         uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3373         UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3374 
3375         if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3376             block_copy_push(block, &copy_state, contig_region, &push);
3377 
3378         uvm_perf_event_notify_migration(&va_space->perf_events,
3379                                         &push,
3380                                         block,
3381                                         dst_id,
3382                                         src_id,
3383                                         uvm_va_block_region_start(block, contig_region),
3384                                         uvm_va_block_region_size(contig_region),
3385                                         transfer_mode,
3386                                         contig_cause,
3387                                         &block_context->make_resident);
3388 
3389         status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
3390     }
3391 
3392     // Update VA block status bits
3393     //
3394     // Only update the bits for the pages that succeeded
3395     if (status != NV_OK)
3396         uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
3397 
3398     *copied_pages = uvm_page_mask_weight(copy_mask);
3399     if (*copied_pages)
3400         uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
3401 
3402     return status;
3403 }
3404 
3405 // Copy resident pages to the destination from all source processors in the
3406 // src_processor_mask
3407 //
3408 // The function adds the pages that were successfully copied to the output
3409 // migrated_pages mask and returns the number of pages in copied_pages. These
3410 // fields are reliable even if an error is returned.
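//
// Source processors are visited in order of increasing distance from dst_id
// (see the for_each_closest_id loop below), so pages are preferentially copied
// from the closest resident location.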
3411 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block, 3412 uvm_va_block_context_t *block_context, 3413 uvm_processor_id_t dst_id, 3414 const uvm_processor_mask_t *src_processor_mask, 3415 uvm_va_block_region_t region, 3416 const uvm_page_mask_t *page_mask, 3417 const uvm_page_mask_t *prefetch_page_mask, 3418 uvm_va_block_transfer_mode_t transfer_mode, 3419 NvU32 max_pages_to_copy, 3420 uvm_page_mask_t *migrated_pages, 3421 NvU32 *copied_pages_out, 3422 uvm_tracker_t *tracker_out) 3423 { 3424 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 3425 uvm_processor_id_t src_id; 3426 uvm_processor_mask_t search_mask; 3427 uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask; 3428 3429 uvm_processor_mask_copy(&search_mask, src_processor_mask); 3430 3431 *copied_pages_out = 0; 3432 3433 for_each_closest_id(src_id, &search_mask, dst_id, va_space) { 3434 uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id); 3435 NV_STATUS status; 3436 NvU32 copied_pages_from_src; 3437 3438 UVM_ASSERT(!uvm_id_equal(src_id, dst_id)); 3439 3440 uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask); 3441 3442 if (page_mask) 3443 uvm_page_mask_and(copy_mask, copy_mask, page_mask); 3444 3445 status = block_copy_resident_pages_between(block, 3446 block_context, 3447 dst_id, 3448 src_id, 3449 region, 3450 copy_mask, 3451 prefetch_page_mask, 3452 transfer_mode, 3453 migrated_pages, 3454 &copied_pages_from_src, 3455 tracker_out); 3456 *copied_pages_out += copied_pages_from_src; 3457 UVM_ASSERT(*copied_pages_out <= max_pages_to_copy); 3458 3459 if (status != NV_OK) 3460 return status; 3461 3462 // Break out once we copied max pages already 3463 if (*copied_pages_out == max_pages_to_copy) 3464 break; 3465 } 3466 3467 return NV_OK; 3468 } 3469 3470 static void break_read_duplication_in_region(uvm_va_block_t *block, 3471 uvm_va_block_context_t *block_context, 3472 uvm_processor_id_t dst_id, 3473 uvm_va_block_region_t region, 3474 const uvm_page_mask_t *page_mask) 3475 { 3476 uvm_processor_id_t id; 3477 uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask; 3478 3479 uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask); 3480 3481 UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id))); 3482 3483 // Clear read_duplicated bit for all pages in region 3484 uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region); 3485 3486 // Clear residency bits for all processors other than dst_id 3487 for_each_id_in_mask(id, &block->resident) { 3488 uvm_page_mask_t *other_resident_mask; 3489 3490 if (uvm_id_equal(id, dst_id)) 3491 continue; 3492 3493 other_resident_mask = uvm_va_block_resident_mask_get(block, id); 3494 3495 if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region)) 3496 block_clear_resident_processor(block, id); 3497 } 3498 } 3499 3500 static void block_copy_set_first_touch_residency(uvm_va_block_t *block, 3501 uvm_va_block_context_t *block_context, 3502 uvm_processor_id_t dst_id, 3503 uvm_va_block_region_t region, 3504 const uvm_page_mask_t *page_mask) 3505 { 3506 uvm_page_index_t page_index; 3507 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3508 uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask; 3509 3510 if (page_mask) 3511 uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask); 3512 
else 3513 uvm_page_mask_complement(first_touch_mask, resident_mask); 3514 3515 uvm_page_mask_region_clear_outside(first_touch_mask, region); 3516 3517 for_each_va_block_page_in_mask(page_index, first_touch_mask, block) { 3518 UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index)); 3519 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); 3520 UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); 3521 } 3522 3523 uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask); 3524 if (!uvm_page_mask_empty(resident_mask)) 3525 block_set_resident_processor(block, dst_id); 3526 3527 // Add them to the output mask, too 3528 uvm_page_mask_or(&block_context->make_resident.pages_changed_residency, 3529 &block_context->make_resident.pages_changed_residency, 3530 first_touch_mask); 3531 } 3532 3533 // Copy resident pages from other processors to the destination. 3534 // All the pages on the destination need to be populated by the caller first. 3535 // Pages not resident anywhere else need to be zeroed out as well. 3536 // The transfer_mode is only used to tell uvm_perf_event_notify_migration() 3537 // whether the copy is for a migration or read duplication. 3538 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block, 3539 uvm_va_block_context_t *block_context, 3540 uvm_processor_id_t dst_id, 3541 uvm_va_block_region_t region, 3542 const uvm_page_mask_t *page_mask, 3543 const uvm_page_mask_t *prefetch_page_mask, 3544 uvm_va_block_transfer_mode_t transfer_mode) 3545 { 3546 NV_STATUS status = NV_OK; 3547 NV_STATUS tracker_status; 3548 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 3549 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3550 NvU32 missing_pages_count; 3551 NvU32 pages_copied; 3552 NvU32 pages_copied_to_cpu; 3553 uvm_processor_mask_t src_processor_mask; 3554 uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask; 3555 uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated; 3556 uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged; 3557 3558 uvm_page_mask_zero(migrated_pages); 3559 uvm_page_mask_zero(staged_pages); 3560 3561 if (page_mask) 3562 uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask); 3563 else 3564 uvm_page_mask_complement(copy_page_mask, resident_mask); 3565 3566 missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region); 3567 3568 if (missing_pages_count == 0) 3569 goto out; 3570 3571 // TODO: Bug 1753731: Add P2P2P copies staged through a GPU 3572 // TODO: Bug 1753731: When a page is resident in multiple locations due to 3573 // read-duplication, spread out the source of the copy so we don't 3574 // bottleneck on a single location. 3575 3576 uvm_processor_mask_zero(&src_processor_mask); 3577 3578 if (!uvm_id_equal(dst_id, UVM_ID_CPU)) { 3579 // If the destination is a GPU, first copy everything from processors 3580 // with copy access supported. Notably this will copy pages from the CPU 3581 // as well even if later some extra copies from CPU are required for 3582 // staged copies. 
3583 uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident); 3584 uvm_processor_mask_clear(&src_processor_mask, dst_id); 3585 3586 status = block_copy_resident_pages_mask(block, 3587 block_context, 3588 dst_id, 3589 &src_processor_mask, 3590 region, 3591 copy_page_mask, 3592 prefetch_page_mask, 3593 transfer_mode, 3594 missing_pages_count, 3595 migrated_pages, 3596 &pages_copied, 3597 &local_tracker); 3598 3599 UVM_ASSERT(missing_pages_count >= pages_copied); 3600 missing_pages_count -= pages_copied; 3601 3602 if (status != NV_OK) 3603 goto out; 3604 3605 if (missing_pages_count == 0) 3606 goto out; 3607 3608 if (pages_copied) 3609 uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages); 3610 } 3611 3612 // Now copy from everywhere else to the CPU. This is both for when the 3613 // destination is the CPU (src_processor_mask empty) and for a staged copy 3614 // (src_processor_mask containing processors with copy access to dst_id). 3615 uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask); 3616 uvm_processor_mask_clear(&src_processor_mask, dst_id); 3617 uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU); 3618 3619 status = block_copy_resident_pages_mask(block, 3620 block_context, 3621 UVM_ID_CPU, 3622 &src_processor_mask, 3623 region, 3624 copy_page_mask, 3625 prefetch_page_mask, 3626 transfer_mode, 3627 missing_pages_count, 3628 staged_pages, 3629 &pages_copied_to_cpu, 3630 &local_tracker); 3631 if (status != NV_OK) 3632 goto out; 3633 3634 // If destination is the CPU then we copied everything there above 3635 if (UVM_ID_IS_CPU(dst_id)) { 3636 uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages); 3637 missing_pages_count -= pages_copied_to_cpu; 3638 3639 goto out; 3640 } 3641 3642 // Add everything to the block's tracker so that the 3643 // block_copy_resident_pages_between() call below will acquire it. 3644 status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); 3645 if (status != NV_OK) 3646 goto out; 3647 uvm_tracker_clear(&local_tracker); 3648 3649 // Now copy staged pages from the CPU to the destination. 3650 status = block_copy_resident_pages_between(block, 3651 block_context, 3652 dst_id, 3653 UVM_ID_CPU, 3654 region, 3655 staged_pages, 3656 prefetch_page_mask, 3657 transfer_mode, 3658 migrated_pages, 3659 &pages_copied, 3660 &local_tracker); 3661 3662 UVM_ASSERT(missing_pages_count >= pages_copied); 3663 missing_pages_count -= pages_copied; 3664 3665 if (status != NV_OK) 3666 goto out; 3667 3668 // If we get here, that means we were staging the copy through the CPU and 3669 // we should copy as many pages from the CPU as we copied to the CPU. 3670 UVM_ASSERT(pages_copied == pages_copied_to_cpu); 3671 3672 out: 3673 // Add everything from the local tracker to the block's tracker. 3674 // Notably this is also needed for handling 3675 // block_copy_resident_pages_between() failures in the first loop. 3676 tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); 3677 uvm_tracker_deinit(&local_tracker); 3678 3679 return status == NV_OK ? 
tracker_status : status; 3680 } 3681 3682 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block, 3683 uvm_va_block_retry_t *va_block_retry, 3684 uvm_va_block_context_t *va_block_context, 3685 uvm_processor_id_t dest_id, 3686 uvm_va_block_region_t region, 3687 const uvm_page_mask_t *page_mask, 3688 const uvm_page_mask_t *prefetch_page_mask, 3689 uvm_make_resident_cause_t cause) 3690 { 3691 NV_STATUS status; 3692 uvm_processor_mask_t unmap_processor_mask; 3693 uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask; 3694 uvm_page_mask_t *resident_mask; 3695 3696 va_block_context->make_resident.dest_id = dest_id; 3697 va_block_context->make_resident.cause = cause; 3698 3699 if (prefetch_page_mask) { 3700 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3701 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3702 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); 3703 } 3704 3705 uvm_assert_mutex_locked(&va_block->lock); 3706 UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 3707 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 3708 3709 resident_mask = block_resident_mask_get_alloc(va_block, dest_id); 3710 if (!resident_mask) 3711 return NV_ERR_NO_MEMORY; 3712 3713 // Unmap all mapped processors except for UVM-Lite GPUs as their mappings 3714 // are largely persistent. 3715 uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); 3716 3717 if (page_mask) 3718 uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask); 3719 else 3720 uvm_page_mask_complement(unmap_page_mask, resident_mask); 3721 uvm_page_mask_region_clear_outside(unmap_page_mask, region); 3722 3723 // Unmap all pages not resident on the destination 3724 status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); 3725 if (status != NV_OK) 3726 return status; 3727 3728 if (page_mask) 3729 uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages); 3730 else 3731 uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages); 3732 uvm_page_mask_region_clear_outside(unmap_page_mask, region); 3733 3734 // Also unmap read-duplicated pages excluding dest_id 3735 uvm_processor_mask_clear(&unmap_processor_mask, dest_id); 3736 status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); 3737 if (status != NV_OK) 3738 return status; 3739 3740 uvm_tools_record_read_duplicate_invalidate(va_block, 3741 dest_id, 3742 region, 3743 unmap_page_mask); 3744 3745 // Note that block_populate_pages and block_copy_resident_pages also use 3746 // va_block_context->make_resident.page_mask. 
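    // Since unmap_page_mask points at that same mask, clear the local pointer
    // so it cannot be dereferenced after the calls below clobber it.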
3747 unmap_page_mask = NULL; 3748 3749 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); 3750 if (status != NV_OK) 3751 return status; 3752 3753 return block_copy_resident_pages(va_block, 3754 va_block_context, 3755 dest_id, 3756 region, 3757 page_mask, 3758 prefetch_page_mask, 3759 UVM_VA_BLOCK_TRANSFER_MODE_MOVE); 3760 } 3761 3762 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block, 3763 uvm_processor_id_t dst_id, 3764 uvm_page_mask_t *page_mask) 3765 { 3766 uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id); 3767 3768 UVM_ASSERT(dst_gpu_state); 3769 3770 if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask)) 3771 uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id); 3772 } 3773 3774 static void block_make_resident_update_state(uvm_va_block_t *va_block, 3775 uvm_va_block_context_t *va_block_context, 3776 uvm_processor_id_t dst_id, 3777 uvm_va_block_region_t region, 3778 uvm_page_mask_t *copy_mask, 3779 uvm_make_resident_cause_t cause) 3780 { 3781 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id); 3782 3783 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask); 3784 block_set_resident_processor(va_block, dst_id); 3785 3786 // Accumulate the pages that migrated into the output mask. 3787 uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency, 3788 &va_block_context->make_resident.pages_changed_residency, 3789 copy_mask); 3790 3791 // Any move operation implies that mappings have been removed from all 3792 // non-UVM-Lite GPUs. 3793 uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask); 3794 3795 // If we are migrating due to an eviction, set the GPU as evicted and 3796 // mark the evicted pages. If we are migrating away from the CPU this 3797 // means that those pages are not evicted. 3798 if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) { 3799 uvm_processor_id_t src_id; 3800 3801 UVM_ASSERT(UVM_ID_IS_CPU(dst_id)); 3802 3803 // Note that the destination is the CPU so this loop excludes it. 3804 for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) { 3805 uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id); 3806 3807 UVM_ASSERT(src_gpu_state); 3808 3809 uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask); 3810 uvm_processor_mask_set(&va_block->evicted_gpus, src_id); 3811 } 3812 } 3813 else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id)) 3814 block_make_resident_clear_evicted(va_block, dst_id, copy_mask); 3815 } 3816 3817 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block, 3818 uvm_va_block_context_t *va_block_context, 3819 uvm_va_block_region_t region, 3820 const uvm_page_mask_t *page_mask) 3821 { 3822 uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated; 3823 uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id; 3824 3825 uvm_assert_mutex_locked(&va_block->lock); 3826 3827 if (page_mask) 3828 uvm_page_mask_and(migrated_pages, migrated_pages, page_mask); 3829 3830 if (!uvm_page_mask_empty(migrated_pages)) { 3831 // The migrated pages are now resident on the destination. 
3832 block_make_resident_update_state(va_block, 3833 va_block_context, 3834 dst_id, 3835 region, 3836 migrated_pages, 3837 va_block_context->make_resident.cause); 3838 } 3839 3840 // Pages that weren't resident anywhere else were populated at the 3841 // destination directly. Mark them as resident now. 3842 block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask); 3843 3844 // Break read duplication and clear residency from other processors. 3845 break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask); 3846 3847 // Update eviction heuristics, if needed. Notably this could repeat the call 3848 // done in block_set_resident_processor(), but that doesn't do anything bad 3849 // and it's simpler to keep it in both places. 3850 // 3851 // Skip this if we didn't do anything (the input region and/or page mask was 3852 // empty). 3853 if (uvm_processor_mask_test(&va_block->resident, dst_id)) 3854 block_mark_memory_used(va_block, dst_id); 3855 } 3856 3857 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block, 3858 uvm_va_block_retry_t *va_block_retry, 3859 uvm_va_block_context_t *va_block_context, 3860 uvm_processor_id_t dest_id, 3861 uvm_va_block_region_t region, 3862 const uvm_page_mask_t *page_mask, 3863 const uvm_page_mask_t *prefetch_page_mask, 3864 uvm_make_resident_cause_t cause) 3865 { 3866 NV_STATUS status; 3867 3868 status = uvm_va_block_make_resident_copy(va_block, 3869 va_block_retry, 3870 va_block_context, 3871 dest_id, 3872 region, 3873 page_mask, 3874 prefetch_page_mask, 3875 cause); 3876 if (status != NV_OK) 3877 return status; 3878 3879 uvm_va_block_make_resident_finish(va_block, 3880 va_block_context, 3881 region, 3882 page_mask); 3883 3884 return NV_OK; 3885 } 3886 3887 // Combination function which prepares the input {region, page_mask} for 3888 // entering read-duplication. It: 3889 // - Unmaps all processors but revoke_id 3890 // - Revokes write access from revoke_id 3891 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block, 3892 uvm_va_block_context_t *va_block_context, 3893 uvm_processor_id_t revoke_id, 3894 uvm_va_block_region_t region, 3895 const uvm_page_mask_t *page_mask) 3896 { 3897 uvm_processor_mask_t unmap_processor_mask; 3898 uvm_processor_id_t unmap_id; 3899 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 3900 NV_STATUS status, tracker_status; 3901 3902 // Unmap everybody except revoke_id 3903 uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); 3904 uvm_processor_mask_clear(&unmap_processor_mask, revoke_id); 3905 3906 for_each_id_in_mask(unmap_id, &unmap_processor_mask) { 3907 status = uvm_va_block_unmap(va_block, 3908 va_block_context, 3909 unmap_id, 3910 region, 3911 page_mask, 3912 &local_tracker); 3913 if (status != NV_OK) 3914 goto out; 3915 } 3916 3917 // Revoke WRITE/ATOMIC access permissions from the remaining mapped 3918 // processor. 3919 status = uvm_va_block_revoke_prot(va_block, 3920 va_block_context, 3921 revoke_id, 3922 region, 3923 page_mask, 3924 UVM_PROT_READ_WRITE, 3925 &local_tracker); 3926 if (status != NV_OK) 3927 goto out; 3928 3929 out: 3930 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 3931 uvm_tracker_deinit(&local_tracker); 3932 return status == NV_OK ? 
tracker_status : status; 3933 } 3934 3935 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block, 3936 uvm_va_block_retry_t *va_block_retry, 3937 uvm_va_block_context_t *va_block_context, 3938 uvm_processor_id_t dest_id, 3939 uvm_va_block_region_t region, 3940 const uvm_page_mask_t *page_mask, 3941 const uvm_page_mask_t *prefetch_page_mask, 3942 uvm_make_resident_cause_t cause) 3943 { 3944 NV_STATUS status = NV_OK; 3945 uvm_processor_id_t src_id; 3946 uvm_page_mask_t *dst_resident_mask; 3947 uvm_page_mask_t *cpu_resident_mask; 3948 uvm_page_mask_t *migrated_pages; 3949 uvm_page_mask_t *staged_pages; 3950 uvm_page_mask_t *first_touch_mask; 3951 3952 // TODO: Bug 3660922: need to implement HMM read duplication support. 3953 UVM_ASSERT(!uvm_va_block_is_hmm(va_block)); 3954 UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range)); 3955 3956 va_block_context->make_resident.dest_id = dest_id; 3957 va_block_context->make_resident.cause = cause; 3958 3959 if (prefetch_page_mask) { 3960 // TODO: Bug 1877578: investigate automatic read-duplicate policies 3961 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3962 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3963 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); 3964 } 3965 3966 uvm_assert_mutex_locked(&va_block->lock); 3967 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 3968 3969 // For pages that are entering read-duplication we need to unmap remote 3970 // mappings and revoke RW and higher access permissions. 3971 // 3972 // The current implementation: 3973 // - Unmaps pages from all processors but the one with the resident copy 3974 // - Revokes write access from the processor with the resident copy 3975 for_each_id_in_mask(src_id, &va_block->resident) { 3976 // Note that the below calls to block_populate_pages and 3977 // block_copy_resident_pages also use 3978 // va_block_context->make_resident.page_mask. 3979 uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask; 3980 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id); 3981 UVM_ASSERT(!uvm_page_mask_empty(resident_mask)); 3982 3983 if (page_mask) 3984 uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages); 3985 else 3986 uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages); 3987 3988 // If there are no pages that need to be unmapped/revoked, skip to the 3989 // next processor 3990 if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask)) 3991 continue; 3992 3993 status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask); 3994 if (status != NV_OK) 3995 return status; 3996 } 3997 3998 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); 3999 if (status != NV_OK) 4000 return status; 4001 4002 status = block_copy_resident_pages(va_block, 4003 va_block_context, 4004 dest_id, 4005 region, 4006 page_mask, 4007 prefetch_page_mask, 4008 UVM_VA_BLOCK_TRANSFER_MODE_COPY); 4009 if (status != NV_OK) 4010 return status; 4011 4012 // Pages that weren't resident anywhere else were populated at the 4013 // destination directly. Mark them as resident now, since there were no 4014 // errors from block_copy_resident_pages() above. 
4015 // Note that va_block_context->scratch_page_mask is passed to 4016 // block_copy_set_first_touch_residency() which is generally unsafe but in 4017 // this case, block_copy_set_first_touch_residency() copies page_mask 4018 // before scratch_page_mask could be clobbered. 4019 migrated_pages = &va_block_context->make_resident.pages_migrated; 4020 first_touch_mask = &va_block_context->scratch_page_mask; 4021 uvm_page_mask_init_from_region(first_touch_mask, region, page_mask); 4022 uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages); 4023 4024 if (!uvm_page_mask_empty(first_touch_mask)) 4025 block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask); 4026 4027 staged_pages = &va_block_context->make_resident.pages_staged; 4028 if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) { 4029 cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU); 4030 uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages); 4031 block_set_resident_processor(va_block, UVM_ID_CPU); 4032 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages); 4033 uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages); 4034 } 4035 4036 if (!uvm_page_mask_empty(migrated_pages)) { 4037 dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id); 4038 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages); 4039 block_set_resident_processor(va_block, dest_id); 4040 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages); 4041 uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages); 4042 } 4043 4044 UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION); 4045 if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id)) 4046 block_make_resident_clear_evicted(va_block, dest_id, migrated_pages); 4047 4048 // Update eviction heuristics, if needed. Notably this could repeat the call 4049 // done in block_set_resident_processor(), but that doesn't do anything bad 4050 // and it's simpler to keep it in both places. 4051 // 4052 // Skip this if we didn't do anything (the input region and/or page mask was 4053 // empty). 4054 if (uvm_processor_mask_test(&va_block->resident, dest_id)) 4055 block_mark_memory_used(va_block, dest_id); 4056 4057 return NV_OK; 4058 } 4059 4060 // Looks up the current CPU mapping state of page from the 4061 // block->cpu.pte_bits bitmaps. If write access is enabled, 4062 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since 4063 // write access implies atomic access for CPUs. 4064 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index) 4065 { 4066 uvm_prot_t prot; 4067 4068 UVM_ASSERT(!uvm_va_block_is_dead(block)); 4069 4070 if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index)) 4071 prot = UVM_PROT_READ_WRITE_ATOMIC; 4072 else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) 4073 prot = UVM_PROT_READ_ONLY; 4074 else 4075 prot = UVM_PROT_NONE; 4076 4077 return prot; 4078 } 4079 4080 // Looks up the current GPU mapping state of page from the 4081 // block->gpus[i]->pte_bits bitmaps. 
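// Returns UVM_PROT_NONE if the GPU has no state allocated for this block.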
4082 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index) 4083 { 4084 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4085 uvm_prot_t prot; 4086 4087 UVM_ASSERT(!uvm_va_block_is_dead(block)); 4088 4089 if (!gpu_state) 4090 return UVM_PROT_NONE; 4091 4092 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index)) 4093 prot = UVM_PROT_READ_WRITE_ATOMIC; 4094 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index)) 4095 prot = UVM_PROT_READ_WRITE; 4096 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) 4097 prot = UVM_PROT_READ_ONLY; 4098 else 4099 prot = UVM_PROT_NONE; 4100 4101 return prot; 4102 } 4103 4104 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index) 4105 { 4106 if (UVM_ID_IS_CPU(id)) 4107 return block_page_prot_cpu(block, page_index); 4108 else 4109 return block_page_prot_gpu(block, block_get_gpu(block, id), page_index); 4110 } 4111 4112 // Returns true if the block has any valid CPU PTE mapping in the block region. 4113 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region) 4114 { 4115 size_t valid_page; 4116 4117 UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block)); 4118 4119 // Early-out: check whether any address in this block has a CPU mapping 4120 if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) { 4121 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])); 4122 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 4123 return false; 4124 } 4125 4126 // All valid mappings have at least read permissions so we only need to 4127 // inspect the read bits. 
4128 valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]); 4129 if (valid_page == region.outer) 4130 return false; 4131 4132 UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE); 4133 return true; 4134 } 4135 4136 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 4137 { 4138 uvm_gpu_t *accessing_gpu; 4139 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4140 4141 if (!uvm_pmm_sysmem_mappings_indirect_supported()) 4142 return true; 4143 4144 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 4145 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 4146 uvm_reverse_map_t reverse_map; 4147 size_t num_mappings; 4148 4149 num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings, 4150 peer_addr, 4151 uvm_gpu_chunk_get_size(chunk), 4152 &reverse_map, 4153 1); 4154 UVM_ASSERT(num_mappings == 1); 4155 UVM_ASSERT(reverse_map.va_block == block); 4156 UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index); 4157 UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk)); 4158 4159 uvm_va_block_release_no_destroy(reverse_map.va_block); 4160 } 4161 4162 return true; 4163 } 4164 4165 // Sanity check the given GPU's chunks array 4166 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id) 4167 { 4168 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 4169 uvm_gpu_t *gpu; 4170 size_t i, num_chunks; 4171 uvm_page_index_t page_index; 4172 uvm_chunk_size_t chunk_size; 4173 4174 if (!gpu_state) 4175 return true; 4176 4177 gpu = block_get_gpu(block, id); 4178 4179 num_chunks = block_num_gpu_chunks(block, gpu); 4180 for (page_index = 0, i = 0; i < num_chunks; i++) { 4181 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 4182 size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); 4183 4184 if (chunk_index != i) { 4185 UVM_ERR_PRINT("chunk index mismatch: calculated %zu, is in %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n", 4186 chunk_index, 4187 i, 4188 block->start, 4189 block->end + 1, 4190 uvm_id_value(id), 4191 page_index); 4192 return false; 4193 } 4194 4195 if (chunk) { 4196 if (chunk_size != uvm_gpu_chunk_get_size(chunk)) { 4197 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n", 4198 chunk_size, 4199 uvm_gpu_chunk_get_size(chunk), 4200 block->start, 4201 block->end + 1, 4202 uvm_id_value(id), 4203 page_index, 4204 i); 4205 return false; 4206 } 4207 4208 if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { 4209 UVM_ERR_PRINT("Invalid chunk state %s. 
VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n", 4210 uvm_pmm_gpu_chunk_state_string(chunk->state), 4211 block->start, 4212 block->end + 1, 4213 uvm_id_value(id), 4214 page_index, 4215 i, 4216 chunk_size); 4217 return false; 4218 } 4219 4220 UVM_ASSERT(chunk->va_block == block); 4221 UVM_ASSERT(chunk->va_block_page_index == page_index); 4222 4223 UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk)); 4224 } 4225 4226 page_index += chunk_size / PAGE_SIZE; 4227 } 4228 4229 return true; 4230 } 4231 4232 static bool block_check_chunks(uvm_va_block_t *va_block) 4233 { 4234 uvm_gpu_id_t id; 4235 4236 for_each_gpu_id(id) { 4237 if (!block_check_gpu_chunks(va_block, id)) 4238 return false; 4239 } 4240 4241 return block_check_cpu_chunks(va_block); 4242 } 4243 4244 // Sanity checks for page mappings 4245 static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index) 4246 { 4247 uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings; 4248 uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings; 4249 uvm_processor_mask_t remaining_mappings, temp_mappings; 4250 uvm_processor_mask_t resident_processors; 4251 const uvm_processor_mask_t *residency_accessible_from = NULL; 4252 const uvm_processor_mask_t *residency_has_native_atomics = NULL; 4253 uvm_processor_id_t residency, id; 4254 uvm_va_range_t *va_range = block->va_range; 4255 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4256 uvm_processor_id_t preferred_location = va_range ? 4257 uvm_va_range_get_policy(va_range)->preferred_location : 4258 UVM_ID_INVALID; 4259 const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block); 4260 4261 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); 4262 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings); 4263 block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings); 4264 4265 // Each access bit implies all accesses below it 4266 UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings)); 4267 UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings)); 4268 UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped)); 4269 4270 uvm_va_block_page_resident_processors(block, page_index, &resident_processors); 4271 UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident)); 4272 4273 // Sanity check block_get_mapped_processors 4274 uvm_processor_mask_copy(&remaining_mappings, &read_mappings); 4275 for_each_id_in_mask(residency, &resident_processors) { 4276 block_get_mapped_processors(block, residency, page_index, &temp_mappings); 4277 UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings)); 4278 uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings); 4279 } 4280 4281 // Any remaining mappings point to non-resident locations, so they must be 4282 // UVM-Lite mappings. 
4283 UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus)); 4284 4285 residency = uvm_processor_mask_find_first_id(&resident_processors); 4286 4287 if (uvm_processor_mask_get_count(&resident_processors) > 0) { 4288 residency_accessible_from = &va_space->accessible_from[uvm_id_value(residency)]; 4289 residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)]; 4290 } 4291 4292 // If the page is not resident, there should be no valid mappings 4293 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 || 4294 uvm_processor_mask_get_count(&read_mappings) == 0, 4295 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4296 *resident_processors.bitmap, 4297 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4298 *va_space->system_wide_atomics_enabled_processors.bitmap, 4299 *block->read_duplicated_pages.bitmap); 4300 4301 // Test read_duplicated_pages mask 4302 UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 && 4303 !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) || 4304 (uvm_processor_mask_get_count(&resident_processors) > 1 && 4305 uvm_page_mask_test(&block->read_duplicated_pages, page_index)), 4306 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4307 *resident_processors.bitmap, 4308 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4309 *va_space->system_wide_atomics_enabled_processors.bitmap, 4310 *block->read_duplicated_pages.bitmap); 4311 4312 if (!uvm_processor_mask_empty(uvm_lite_gpus)) 4313 UVM_ASSERT(UVM_ID_IS_VALID(preferred_location)); 4314 4315 // UVM-Lite checks. Since the range group is made non-migratable before the 4316 // actual migrations for that range group happen, we can only make those 4317 // checks which are valid on both migratable and non-migratable range 4318 // groups. 4319 uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus); 4320 uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus); 4321 4322 // Any mapping from a UVM-Lite GPU must be atomic... 4323 UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings)); 4324 4325 // ... 
and must have access to preferred_location 4326 if (UVM_ID_IS_VALID(preferred_location)) { 4327 const uvm_processor_mask_t *preferred_location_accessible_from; 4328 4329 preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)]; 4330 UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from)); 4331 } 4332 4333 for_each_id_in_mask(id, &lite_atomic_mappings) 4334 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location)); 4335 4336 // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests 4337 uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus); 4338 uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus); 4339 uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus); 4340 4341 // Pages set to zero in maybe_mapped_pages must not be mapped on any 4342 // non-UVM-Lite GPU 4343 if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) { 4344 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0, 4345 "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n", 4346 *resident_processors.bitmap, 4347 *block->mapped.bitmap, 4348 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap); 4349 } 4350 4351 // atomic mappings from GPUs with disabled system-wide atomics are treated 4352 // as write mappings. Therefore, we remove them from the atomic mappings mask 4353 uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors); 4354 4355 if (!uvm_processor_mask_empty(&read_mappings)) { 4356 // Read-duplicate: if a page is resident in multiple locations, it 4357 // must be resident locally on each mapped processor. 
4358 if (uvm_processor_mask_get_count(&resident_processors) > 1) { 4359 UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors), 4360 "Read-duplicate copies from remote processors\n" 4361 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4362 *resident_processors.bitmap, 4363 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4364 *va_space->system_wide_atomics_enabled_processors.bitmap, 4365 *block->read_duplicated_pages.bitmap); 4366 } 4367 else { 4368 // Processors with mappings must have access to the processor that 4369 // has the valid copy 4370 UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from), 4371 "Not all processors have access to %s\n" 4372 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4373 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4374 uvm_va_space_processor_name(va_space, residency), 4375 *resident_processors.bitmap, 4376 *read_mappings.bitmap, 4377 *write_mappings.bitmap, 4378 *atomic_mappings.bitmap, 4379 *residency_accessible_from->bitmap, 4380 *residency_has_native_atomics->bitmap, 4381 *va_space->system_wide_atomics_enabled_processors.bitmap); 4382 for_each_id_in_mask(id, &read_mappings) { 4383 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency)); 4384 4385 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) { 4386 uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency); 4387 uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id); 4388 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL); 4389 4390 // This function will assert if no mapping exists 4391 (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu); 4392 } 4393 } 4394 } 4395 } 4396 4397 // If any processor has a writable mapping, there must only be one copy of 4398 // the page in the system 4399 if (!uvm_processor_mask_empty(&write_mappings)) { 4400 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1, 4401 "Too many resident copies for pages with write_mappings\n" 4402 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4403 *resident_processors.bitmap, 4404 *read_mappings.bitmap, 4405 *write_mappings.bitmap, 4406 *atomic_mappings.bitmap, 4407 *va_space->system_wide_atomics_enabled_processors.bitmap, 4408 *block->read_duplicated_pages.bitmap); 4409 } 4410 4411 if (!uvm_processor_mask_empty(&atomic_mappings)) { 4412 uvm_processor_mask_t native_atomics; 4413 4414 uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics); 4415 4416 if (uvm_processor_mask_empty(&native_atomics)) { 4417 // No other faultable processor should be able to write 4418 uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors); 4419 4420 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1, 4421 "Too many write mappings to %s from processors with non-native atomics\n" 4422 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4423 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4424 uvm_va_space_processor_name(va_space, residency), 4425 *resident_processors.bitmap, 4426 *read_mappings.bitmap, 4427 *write_mappings.bitmap, 4428 *atomic_mappings.bitmap, 4429 *residency_accessible_from->bitmap, 4430 *residency_has_native_atomics->bitmap, 4431 *va_space->system_wide_atomics_enabled_processors.bitmap); 4432 4433 // Only one 
processor outside of the native group can have atomics enabled 4434 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1, 4435 "Too many atomics mappings to %s from processors with non-native atomics\n" 4436 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4437 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4438 uvm_va_space_processor_name(va_space, residency), 4439 *resident_processors.bitmap, 4440 *read_mappings.bitmap, 4441 *write_mappings.bitmap, 4442 *atomic_mappings.bitmap, 4443 *residency_accessible_from->bitmap, 4444 *residency_has_native_atomics->bitmap, 4445 *va_space->system_wide_atomics_enabled_processors.bitmap); 4446 } 4447 else { 4448 uvm_processor_mask_t non_native_atomics; 4449 4450 // One or more processors within the native group have atomics enabled. 4451 // All processors outside of that group may have write but not atomic 4452 // permissions. 4453 uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics); 4454 4455 UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics), 4456 "atomic mappings to %s from processors native and non-native\n" 4457 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4458 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4459 uvm_va_space_processor_name(va_space, residency), 4460 *resident_processors.bitmap, 4461 *read_mappings.bitmap, 4462 *write_mappings.bitmap, 4463 *atomic_mappings.bitmap, 4464 *residency_accessible_from->bitmap, 4465 *residency_has_native_atomics->bitmap, 4466 *va_space->system_wide_atomics_enabled_processors.bitmap); 4467 } 4468 } 4469 4470 return true; 4471 } 4472 4473 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu) 4474 { 4475 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4476 uvm_va_block_gpu_state_t *resident_gpu_state; 4477 uvm_pte_bits_gpu_t pte_bit; 4478 uvm_processor_id_t resident_id; 4479 uvm_prot_t prot; 4480 NvU32 big_page_size; 4481 size_t num_big_pages, big_page_index; 4482 uvm_va_block_region_t big_region, chunk_region; 4483 uvm_gpu_chunk_t *chunk; 4484 4485 if (!gpu_state->page_table_range_4k.table) 4486 UVM_ASSERT(!gpu_state->activated_4k); 4487 4488 if (!gpu_state->page_table_range_big.table) { 4489 UVM_ASSERT(!gpu_state->initialized_big); 4490 UVM_ASSERT(!gpu_state->activated_big); 4491 } 4492 4493 // It's only safe to check the PTE mappings if we have page tables. See 4494 // uvm_va_block_get_gpu_va_space. 4495 if (!block_gpu_has_page_tables(block, gpu)) { 4496 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id)); 4497 return true; 4498 } 4499 4500 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 4501 num_big_pages = uvm_va_block_num_big_pages(block, big_page_size); 4502 4503 if (block_gpu_supports_2m(block, gpu)) { 4504 if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) { 4505 // 2M blocks require the 2M entry to be allocated for the lower 4506 // ranges to also be allocated. 4507 UVM_ASSERT(gpu_state->page_table_range_2m.table); 4508 } 4509 else if (gpu_state->page_table_range_2m.table) { 4510 // If the 2M entry is present but the lower ones aren't, the PTE 4511 // must be 2M. 
4512 UVM_ASSERT(gpu_state->pte_is_2m); 4513 } 4514 } 4515 else { 4516 UVM_ASSERT(!gpu_state->page_table_range_2m.table); 4517 if (num_big_pages == 0) 4518 UVM_ASSERT(!gpu_state->page_table_range_big.table); 4519 } 4520 4521 // If we have the big table and it's in use then it must have been 4522 // initialized, even if it doesn't currently contain active PTEs. 4523 if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) || 4524 (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big)) 4525 UVM_ASSERT(gpu_state->initialized_big); 4526 4527 if (gpu_state->pte_is_2m) { 4528 UVM_ASSERT(block_gpu_supports_2m(block, gpu)); 4529 UVM_ASSERT(gpu_state->page_table_range_2m.table); 4530 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 4531 UVM_ASSERT(!gpu_state->force_4k_ptes); 4532 4533 // GPU architectures which support 2M pages only support 64K as the big 4534 // page size. All of the 2M code assumes that 4535 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full, 4536 // bitmap_complement, etc). 4537 BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4538 4539 prot = block_page_prot_gpu(block, gpu, 0); 4540 4541 // All page permissions match 4542 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 4543 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) 4544 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit])); 4545 else 4546 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit])); 4547 } 4548 4549 if (prot != UVM_PROT_NONE) { 4550 resident_id = block_gpu_get_processor_to_map(block, gpu, 0); 4551 4552 // block_check_resident_proximity verifies that no closer processor 4553 // has a resident page, so we don't need to check that all pages 4554 // have the same resident_id. 4555 4556 // block_check_mappings_page verifies that all pages marked resident 4557 // are backed by populated memory. 4558 4559 // The mapped processor should be fully resident and physically- 4560 // contiguous. 4561 UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id))); 4562 4563 if (UVM_ID_IS_GPU(resident_id)) { 4564 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id); 4565 UVM_ASSERT(resident_gpu_state); 4566 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M); 4567 } 4568 else { 4569 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block, 4570 uvm_va_block_region_from_block(block), 4571 NULL); 4572 4573 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated)); 4574 UVM_ASSERT(chunk); 4575 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 4576 } 4577 } 4578 } 4579 else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 4580 UVM_ASSERT(gpu_state->page_table_range_big.table); 4581 UVM_ASSERT(!gpu_state->force_4k_ptes); 4582 UVM_ASSERT(num_big_pages > 0); 4583 UVM_ASSERT(gpu_state->initialized_big); 4584 4585 for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) { 4586 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 4587 4588 if (!test_bit(big_page_index, gpu_state->big_ptes)) { 4589 // If there are valid mappings but this isn't a big PTE, the 4590 // mapping must be using the 4k PTEs. 
4591 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region)) 4592 UVM_ASSERT(gpu_state->page_table_range_4k.table); 4593 continue; 4594 } 4595 4596 prot = block_page_prot_gpu(block, gpu, big_region.first); 4597 4598 // All page permissions match 4599 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 4600 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) 4601 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region)); 4602 else 4603 UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region)); 4604 } 4605 4606 if (prot != UVM_PROT_NONE) { 4607 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 4608 4609 // The mapped processor should be fully resident and physically- 4610 // contiguous. Exception: UVM-Lite GPUs always map the preferred 4611 // location even if the memory is resident elsewhere. Skip the 4612 // residency check but still verify contiguity. 4613 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { 4614 UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id), 4615 big_region)); 4616 } 4617 4618 if (UVM_ID_IS_CPU(resident_id)) { 4619 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first); 4620 4621 UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages); 4622 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region)); 4623 } 4624 else { 4625 // Check GPU chunks 4626 chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL); 4627 chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first); 4628 UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region)); 4629 } 4630 } 4631 } 4632 } 4633 4634 return true; 4635 } 4636 4637 static bool block_check_mappings(uvm_va_block_t *block) 4638 { 4639 uvm_page_index_t page_index; 4640 uvm_processor_id_t id; 4641 4642 // Verify the master masks, since block_check_mappings_page relies on them 4643 for_each_processor_id(id) { 4644 const uvm_page_mask_t *resident_mask, *map_mask; 4645 4646 if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) { 4647 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 4648 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); 4649 UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id)); 4650 continue; 4651 } 4652 4653 resident_mask = uvm_va_block_resident_mask_get(block, id); 4654 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask)); 4655 4656 map_mask = uvm_va_block_map_mask_get(block, id); 4657 UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask)); 4658 4659 if (UVM_ID_IS_GPU(id)) { 4660 const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id); 4661 UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask)); 4662 4663 // Pages cannot be resident if they are marked as evicted 4664 UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask)); 4665 4666 // Pages cannot be resident on a GPU with no memory 4667 if (!block_processor_has_memory(block, id)) 4668 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 4669 } 4670 } 4671 4672 // Check that every page has coherent mappings 4673 for_each_va_block_page(page_index, block) 4674 block_check_mappings_page(block, page_index); 4675 4676 for_each_gpu_id(id) { 4677 if 
(uvm_va_block_gpu_state_get(block, id)) { 4678 uvm_gpu_t *gpu = block_get_gpu(block, id); 4679 4680 // Check big and/or 2M PTE state 4681 block_check_mappings_ptes(block, gpu); 4682 } 4683 } 4684 4685 return true; 4686 } 4687 4688 // See the comments on uvm_va_block_unmap 4689 static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages) 4690 { 4691 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4692 uvm_pte_bits_cpu_t pte_bit; 4693 bool unmapped_something = false; 4694 uvm_va_block_region_t subregion; 4695 NvU32 num_mapped_processors; 4696 4697 // Early-out if nothing in the region is mapped or being unmapped. 4698 if (!block_has_valid_mapping_cpu(block, region) || 4699 (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))) 4700 return; 4701 4702 // We can't actually unmap HMM ranges from the CPU here. 4703 // Unmapping happens as part of migrate_vma_setup(). 4704 if (uvm_va_block_is_hmm(block)) { 4705 UVM_ASSERT(!uvm_va_block_is_hmm(block)); 4706 return; 4707 } 4708 4709 num_mapped_processors = uvm_processor_mask_get_count(&block->mapped); 4710 4711 // If we are unmapping a page which we are tracking due to CPU faults with 4712 // correct permissions, clear the info. This will cover both the unmap and 4713 // revoke cases (since we implement CPU revocation by unmap + map) 4714 if (block->cpu.fault_authorized.first_fault_stamp && 4715 uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index)) 4716 block->cpu.fault_authorized.first_fault_stamp = 0; 4717 4718 for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) { 4719 if (!block_has_valid_mapping_cpu(block, subregion)) 4720 continue; 4721 4722 unmap_mapping_range(va_space->mapping, 4723 uvm_va_block_region_start(block, subregion), 4724 uvm_va_block_region_size(subregion), 1); 4725 4726 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) 4727 uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion); 4728 4729 // If the CPU is the only processor with mappings we can safely mark 4730 // the pages as fully unmapped 4731 if (num_mapped_processors == 1) 4732 uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion); 4733 4734 unmapped_something = true; 4735 } 4736 4737 if (!unmapped_something) 4738 return; 4739 4740 // Check whether the block has any more mappings 4741 if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) { 4742 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 4743 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 4744 } 4745 4746 UVM_ASSERT(block_check_mappings(block)); 4747 } 4748 4749 // Given a mask of mapped pages, returns true if any of the pages in the mask 4750 // are mapped remotely by the given GPU. 4751 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block, 4752 uvm_va_block_context_t *block_context, 4753 uvm_gpu_id_t gpu_id, 4754 const uvm_page_mask_t *mapped_pages) 4755 { 4756 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); 4757 4758 if (!gpu_state) 4759 return false; 4760 4761 // The caller must ensure that all pages of the input mask are really mapped 4762 UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); 4763 4764 // UVM-Lite GPUs map the preferred location if it's accessible, regardless 4765 // of the resident location. 
4766 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) { 4767 if (uvm_page_mask_empty(mapped_pages)) 4768 return false; 4769 4770 return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id); 4771 } 4772 4773 // Remote pages are pages which are mapped but not resident locally 4774 return uvm_page_mask_andnot(&block_context->scratch_page_mask, mapped_pages, &gpu_state->resident); 4775 } 4776 4777 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If 4778 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. 4779 // 4780 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The 4781 // caller is responsible for ending the TLB batch with the appropriate membar. 4782 static void block_gpu_pte_clear_4k(uvm_va_block_t *block, 4783 uvm_gpu_t *gpu, 4784 const uvm_page_mask_t *clear_page_mask, 4785 NvU64 pte_clear_val, 4786 uvm_pte_batch_t *pte_batch, 4787 uvm_tlb_batch_t *tlb_batch) 4788 { 4789 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4790 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 4791 uvm_gpu_phys_address_t pte_addr; 4792 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); 4793 uvm_va_block_region_t region = uvm_va_block_region_from_block(block); 4794 uvm_va_block_region_t subregion; 4795 size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; 4796 4797 for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) { 4798 num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page; 4799 4800 pte_addr = uvm_page_table_range_entry_address(tree, 4801 &gpu_state->page_table_range_4k, 4802 subregion.first * ptes_per_page); 4803 4804 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes); 4805 4806 if (tlb_batch) { 4807 uvm_tlb_batch_invalidate(tlb_batch, 4808 uvm_va_block_region_start(block, subregion), 4809 uvm_va_block_region_size(subregion), 4810 UVM_PAGE_SIZE_4K, 4811 UVM_MEMBAR_NONE); 4812 } 4813 } 4814 } 4815 4816 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id 4817 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use 4818 // block_gpu_pte_clear_4k instead. 4819 // 4820 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. 4821 // 4822 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The 4823 // caller is responsible for ending the TLB batch with the appropriate membar. 
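// Illustrative sketch (not part of the driver): the batched call pattern the
// PTE-writing helpers here expect. The variables push, pte_batch, tlb_batch,
// tree, resident_id and pages_to_write are assumed to come from the caller's
// block_context and GPU VA space, as in the mapping functions later in this
// file.
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(tree, tlb_batch);
//
//     block_gpu_pte_write_4k(block, gpu, resident_id, UVM_PROT_READ_ONLY,
//                            pages_to_write, pte_batch, tlb_batch);
//
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);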
4824 static void block_gpu_pte_write_4k(uvm_va_block_t *block, 4825 uvm_gpu_t *gpu, 4826 uvm_processor_id_t resident_id, 4827 uvm_prot_t new_prot, 4828 const uvm_page_mask_t *write_page_mask, 4829 uvm_pte_batch_t *pte_batch, 4830 uvm_tlb_batch_t *tlb_batch) 4831 { 4832 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4833 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 4834 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); 4835 const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; 4836 uvm_va_block_region_t contig_region = {0}; 4837 uvm_gpu_phys_address_t contig_addr = {0}; 4838 uvm_gpu_phys_address_t page_addr = {0}; 4839 uvm_page_index_t page_index; 4840 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 4841 4842 UVM_ASSERT(new_prot != UVM_PROT_NONE); 4843 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 4844 4845 for_each_va_block_page_in_mask(page_index, write_page_mask, block) { 4846 uvm_gpu_phys_address_t pte_addr; 4847 size_t i; 4848 4849 // Assume that this mapping will be used to write to the page 4850 if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) 4851 block_mark_cpu_page_dirty(block, page_index); 4852 4853 if (page_index >= contig_region.outer) { 4854 contig_region = block_phys_contig_region(block, page_index, resident_id); 4855 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); 4856 page_addr = contig_addr; 4857 } 4858 4859 page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE; 4860 4861 pte_addr = uvm_page_table_range_entry_address(tree, 4862 &gpu_state->page_table_range_4k, 4863 page_index * ptes_per_page); 4864 4865 // Handle PAGE_SIZE > GPU PTE size 4866 for (i = 0; i < ptes_per_page; i++) { 4867 NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 4868 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 4869 page_addr.address += UVM_PAGE_SIZE_4K; 4870 pte_addr.address += pte_size; 4871 } 4872 4873 if (tlb_batch) { 4874 NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index); 4875 uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE); 4876 } 4877 } 4878 } 4879 4880 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered. 4881 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It 4882 // only writes 4k PTEs, not big PTEs. 4883 // 4884 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the 4885 // mapping from the corresponding big page (0) and which ones should be written 4886 // using memory from resident_id and new_prot (1). Unlike the other pte_write 4887 // functions, new_prot may be UVM_PROT_NONE. 4888 // 4889 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID 4890 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE 4891 // in this case. 4892 // 4893 // new_pages_mask must not be NULL. 4894 // 4895 // No TLB invalidates are required since we've set up the lower PTEs to never be 4896 // cached by the GPU's MMU when covered by larger PTEs. 
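// Illustrative sketch (hypothetical values): splitting only the big PTE at big
// page index 3 so that the pages selected in pages_to_write get
// UVM_PROT_READ_WRITE, while the remaining 4k PTEs under that big page inherit
// the current mapping. big_ptes_covered and pages_to_write are assumed to be
// prepared by the caller.
//
//     DECLARE_BITMAP(big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
//
//     bitmap_zero(big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
//     __set_bit(3, big_ptes_covered);
//
//     block_gpu_pte_big_split_write_4k(block,
//                                      block_context,
//                                      gpu,
//                                      resident_id,
//                                      UVM_PROT_READ_WRITE,
//                                      big_ptes_covered,
//                                      pages_to_write,
//                                      pte_batch);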
4897 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block, 4898 uvm_va_block_context_t *block_context, 4899 uvm_gpu_t *gpu, 4900 uvm_processor_id_t resident_id, 4901 uvm_prot_t new_prot, 4902 const unsigned long *big_ptes_covered, 4903 const uvm_page_mask_t *new_pages_mask, 4904 uvm_pte_batch_t *pte_batch) 4905 { 4906 uvm_va_block_region_t big_region; 4907 size_t big_page_index; 4908 uvm_processor_id_t curr_resident_id; 4909 uvm_prot_t curr_prot; 4910 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 4911 4912 if (UVM_ID_IS_INVALID(resident_id)) 4913 UVM_ASSERT(new_prot == UVM_PROT_NONE); 4914 4915 for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 4916 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 4917 4918 curr_prot = block_page_prot_gpu(block, gpu, big_region.first); 4919 4920 // The unmap path doesn't know the current residency ahead of time, so 4921 // we have to look it up. 4922 if (UVM_ID_IS_INVALID(resident_id)) { 4923 curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 4924 } 4925 else { 4926 // Check that we aren't changing the aperture of the existing 4927 // mappings. It could be legal in some cases (switching from {RO, A} 4928 // to {RO, B} for example) but we'd need to issue TLB membars. 4929 if (curr_prot != UVM_PROT_NONE) 4930 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id)); 4931 4932 curr_resident_id = resident_id; 4933 } 4934 4935 // pages in new_pages_mask under this big page get new_prot 4936 uvm_page_mask_zero(&block_context->scratch_page_mask); 4937 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 4938 if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { 4939 if (new_prot == UVM_PROT_NONE) { 4940 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 4941 } 4942 else { 4943 block_gpu_pte_write_4k(block, 4944 gpu, 4945 curr_resident_id, 4946 new_prot, 4947 &block_context->scratch_page_mask, 4948 pte_batch, 4949 NULL); 4950 } 4951 } 4952 4953 // All other pages under this big page inherit curr_prot 4954 uvm_page_mask_zero(&block_context->scratch_page_mask); 4955 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 4956 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { 4957 if (curr_prot == UVM_PROT_NONE) { 4958 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 4959 } 4960 else { 4961 block_gpu_pte_write_4k(block, 4962 gpu, 4963 curr_resident_id, 4964 curr_prot, 4965 &block_context->scratch_page_mask, 4966 pte_batch, 4967 NULL); 4968 } 4969 } 4970 } 4971 } 4972 4973 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is 4974 // NULL, all big PTEs in the {block, gpu} are cleared. 4975 // 4976 // If tlb_batch is provided, the big PTEs written are added to the batch. The 4977 // caller is responsible for ending the TLB batch with the appropriate membar. 
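// Illustrative sketch: the two clear values used by callers of
// block_gpu_pte_clear_big in this file. Writing the HAL's unmapped pattern
// makes a big PTE "unmapped" so MMU fills stop at it, while writing 0 makes it
// invalid so the MMU reads the 4k PTEs beneath it. tree, pte_batch and
// tlb_batch are assumed to be the caller's usual page tree and batches.
//
//     // Unmap every big PTE in the block (a NULL mask means all of them):
//     block_gpu_pte_clear_big(block, gpu, NULL,
//                             tree->hal->unmapped_pte(tree->big_page_size),
//                             pte_batch, tlb_batch);
//
//     // Invalidate a specific set of big PTEs to expose the 4k PTEs under them:
//     block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);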
4978 static void block_gpu_pte_clear_big(uvm_va_block_t *block, 4979 uvm_gpu_t *gpu, 4980 const unsigned long *big_ptes_mask, 4981 NvU64 pte_clear_val, 4982 uvm_pte_batch_t *pte_batch, 4983 uvm_tlb_batch_t *tlb_batch) 4984 { 4985 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4986 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 4987 NvU32 big_page_size = gpu_va_space->page_tables.big_page_size; 4988 uvm_gpu_phys_address_t pte_addr; 4989 NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size); 4990 size_t big_page_index; 4991 DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4992 4993 if (big_ptes_mask) 4994 bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4995 else 4996 bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size)); 4997 4998 for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 4999 pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables, 5000 &gpu_state->page_table_range_big, 5001 big_page_index); 5002 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1); 5003 5004 if (tlb_batch) { 5005 uvm_tlb_batch_invalidate(tlb_batch, 5006 uvm_va_block_big_page_addr(block, big_page_index, big_page_size), 5007 big_page_size, 5008 big_page_size, 5009 UVM_MEMBAR_NONE); 5010 } 5011 } 5012 } 5013 5014 // Writes the big PTEs in big_ptes_mask using memory from resident_id with 5015 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use 5016 // block_gpu_pte_clear_big instead. 5017 // 5018 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL. 5019 // 5020 // If tlb_batch is provided, the big PTEs written are added to the batch. The 5021 // caller is responsible for ending the TLB batch with the appropriate membar. 
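// Illustrative sketch (hypothetical caller): writing a single big PTE backed
// by this GPU's own vidmem with read-only permission. big_page_index is
// assumed to have been computed by the caller; the bitmap helpers mirror how
// the functions below build their masks.
//
//     DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
//
//     bitmap_zero(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
//     __set_bit(big_page_index, big_ptes);
//
//     block_gpu_pte_write_big(block, gpu, gpu->id, UVM_PROT_READ_ONLY,
//                             big_ptes, pte_batch, tlb_batch);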
5022 static void block_gpu_pte_write_big(uvm_va_block_t *block, 5023 uvm_gpu_t *gpu, 5024 uvm_processor_id_t resident_id, 5025 uvm_prot_t new_prot, 5026 const unsigned long *big_ptes_mask, 5027 uvm_pte_batch_t *pte_batch, 5028 uvm_tlb_batch_t *tlb_batch) 5029 { 5030 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5031 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5032 uvm_page_tree_t *tree = &gpu_va_space->page_tables; 5033 NvU32 big_page_size = tree->big_page_size; 5034 NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size); 5035 size_t big_page_index; 5036 uvm_va_block_region_t contig_region = {0}; 5037 uvm_gpu_phys_address_t contig_addr = {0}; 5038 uvm_gpu_phys_address_t page_addr = {0}; 5039 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 5040 5041 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5042 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5043 UVM_ASSERT(big_ptes_mask); 5044 5045 if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 5046 UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0); 5047 5048 if (!gpu->parent->can_map_sysmem_with_large_pages) 5049 UVM_ASSERT(UVM_ID_IS_GPU(resident_id)); 5050 } 5051 5052 for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5053 NvU64 pte_val; 5054 uvm_gpu_phys_address_t pte_addr; 5055 uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5056 5057 // Assume that this mapping will be used to write to the page 5058 if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) { 5059 uvm_page_index_t page_index; 5060 5061 for_each_va_block_page_in_region(page_index, big_region) 5062 block_mark_cpu_page_dirty(block, page_index); 5063 } 5064 5065 if (big_region.first >= contig_region.outer) { 5066 contig_region = block_phys_contig_region(block, big_region.first, resident_id); 5067 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); 5068 page_addr = contig_addr; 5069 } 5070 5071 page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE; 5072 5073 pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index); 5074 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 5075 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 5076 5077 if (tlb_batch) { 5078 uvm_tlb_batch_invalidate(tlb_batch, 5079 uvm_va_block_region_start(block, big_region), 5080 big_page_size, 5081 big_page_size, 5082 UVM_MEMBAR_NONE); 5083 } 5084 } 5085 } 5086 5087 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in 5088 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and 5089 // tlb_batch in order to poison the now-unused 4k PTEs. 5090 // 5091 // The 4k PTEs are invalidated with the specified membar. 
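// Illustrative sketch: unlike the helpers above, this function ends the PTE
// and TLB batches itself, so a caller only opens them and performs its other
// independent PTE writes first. This mirrors how block_gpu_map_big_and_4k
// uses it further below.
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(tree, tlb_batch);
//
//     // ...independent big/4k PTE writes...
//
//     block_gpu_pte_merge_big_and_end(block, block_context, gpu, big_ptes_merge,
//                                     push, pte_batch, tlb_batch, tlb_membar);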
5092 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block, 5093 uvm_va_block_context_t *block_context, 5094 uvm_gpu_t *gpu, 5095 const unsigned long *big_ptes_to_merge, 5096 uvm_push_t *push, 5097 uvm_pte_batch_t *pte_batch, 5098 uvm_tlb_batch_t *tlb_batch, 5099 uvm_membar_t tlb_membar) 5100 { 5101 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5102 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5103 NvU32 big_page_size = tree->big_page_size; 5104 NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size); 5105 size_t big_page_index; 5106 DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5107 5108 UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5109 UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5110 5111 // We can be called with the 4k PTEs in two cases: 5112 // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active. 5113 // 5114 // 2) 4k PTEs unallocated. In this case the GPU may not have invalid 4k PTEs 5115 // active under the big PTE, depending on whether neighboring blocks 5116 // caused the page tables to be allocated. 5117 // 5118 // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has 5119 // them cached. 5120 5121 // Each big PTE is currently invalid so the 4ks are active (or unallocated). 5122 // First make the big PTEs unmapped to disable future lookups of the 4ks 5123 // under it. We can't directly transition the entry from valid 4k PTEs to 5124 // valid big PTEs, because that could cause the GPU TLBs to cache the same 5125 // VA in different cache lines. That could cause memory ordering to not be 5126 // maintained. 5127 block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch); 5128 5129 // Now invalidate the big PTEs we just wrote as well as all 4ks under them. 5130 // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only 5131 // need to invalidate the 4k PTEs without actually writing them. 5132 for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5133 uvm_tlb_batch_invalidate(tlb_batch, 5134 uvm_va_block_big_page_addr(block, big_page_index, big_page_size), 5135 big_page_size, 5136 big_page_size | UVM_PAGE_SIZE_4K, 5137 UVM_MEMBAR_NONE); 5138 } 5139 5140 // End the batches for the caller. We need to do this here in order to 5141 // poison the 4ks below. 5142 uvm_pte_batch_end(pte_batch); 5143 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5144 5145 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused 5146 // PTEs with a pattern which will trigger fatal faults on access. We have to 5147 // do this after the TLB invalidate of the big PTEs, or the GPU might use 5148 // the new values. 5149 if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) { 5150 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge); 5151 uvm_pte_batch_begin(push, pte_batch); 5152 block_gpu_pte_clear_4k(block, 5153 gpu, 5154 &block_context->scratch_page_mask, 5155 tree->hal->poisoned_pte(), 5156 pte_batch, 5157 NULL); 5158 uvm_pte_batch_end(pte_batch); 5159 } 5160 } 5161 5162 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}. 5163 // 5164 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is 5165 // responsible for ending the TLB batch with the appropriate membar. 
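// Illustrative sketch: dropping an active 2M mapping entirely, the same
// sequence block_gpu_unmap_to_2m uses further below when gpu_state->pte_is_2m
// is set. gpu_va_space, push and the batches are assumed to be the caller's.
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
//
//     block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
//
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);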
5166 static void block_gpu_pte_clear_2m(uvm_va_block_t *block, 5167 uvm_gpu_t *gpu, 5168 uvm_pte_batch_t *pte_batch, 5169 uvm_tlb_batch_t *tlb_batch) 5170 { 5171 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5172 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5173 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); 5174 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); 5175 5176 // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE, 5177 // which would cause a problem when trying to make the entry invalid since 5178 // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire 5179 // 16 bytes. 5180 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1); 5181 5182 if (tlb_batch) 5183 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5184 } 5185 5186 // Writes the 2M PTE for {block, gpu} using memory from resident_id with 5187 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use 5188 // block_gpu_pte_clear_2m instead. 5189 // 5190 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is 5191 // responsible for ending the TLB batch with the appropriate membar. 5192 static void block_gpu_pte_write_2m(uvm_va_block_t *block, 5193 uvm_gpu_t *gpu, 5194 uvm_processor_id_t resident_id, 5195 uvm_prot_t new_prot, 5196 uvm_pte_batch_t *pte_batch, 5197 uvm_tlb_batch_t *tlb_batch) 5198 { 5199 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5200 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5201 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); 5202 uvm_gpu_phys_address_t page_addr; 5203 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); 5204 NvU64 pte_val; 5205 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 5206 5207 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5208 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5209 5210 if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) 5211 block_mark_cpu_page_dirty(block, 0); 5212 5213 page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu); 5214 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 5215 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 5216 5217 if (tlb_batch) 5218 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5219 } 5220 5221 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu) 5222 { 5223 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5224 5225 if (!block_gpu_supports_2m(block, gpu)) 5226 return false; 5227 5228 if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) || 5229 (gpu_state->page_table_range_4k.table && !gpu_state->activated_4k)) 5230 return true; 5231 5232 return false; 5233 } 5234 5235 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or 5236 // activates a newly-allocated page table (big or 4k) while the other is already 5237 // active. The caller must have already written the new PTEs under the table 5238 // with the appropriate membar. 
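// Illustrative sketch: the activation step at the tail of the mapping
// functions below. The lower PTEs are assumed to have been written already and
// the TLB batch to still be open; the final invalidate is issued when that
// batch ends.
//
//     if (block_gpu_needs_to_activate_table(block, gpu))
//         block_gpu_write_pde(block, gpu, push, tlb_batch);
//
//     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);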
5239 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch) 5240 { 5241 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5242 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5243 5244 if (!gpu_state->pte_is_2m) 5245 UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu)); 5246 5247 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); 5248 5249 // We always need a membar to order PDE/PTE writes with the TLB invalidate. 5250 // write_pde will do a MEMBAR_SYS by default. 5251 if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID) 5252 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); 5253 uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push); 5254 5255 gpu->parent->host_hal->wait_for_idle(push); 5256 5257 // Invalidate just the PDE 5258 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5259 5260 if (gpu_state->page_table_range_big.table) 5261 gpu_state->activated_big = true; 5262 5263 if (gpu_state->page_table_range_4k.table) 5264 gpu_state->activated_4k = true; 5265 } 5266 5267 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should 5268 // have written all lower PTEs as appropriate into the given pte_batch already. 5269 // This function ends the PTE batch, activates the 2M PDE, and does a TLB 5270 // invalidate. 5271 // 5272 // The caller does not need to do any TLB invalidates since none of the lower 5273 // PTEs could be cached. 5274 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block, 5275 uvm_gpu_t *gpu, 5276 uvm_push_t *push, 5277 uvm_pte_batch_t *pte_batch, 5278 uvm_tlb_batch_t *tlb_batch, 5279 uvm_membar_t tlb_membar) 5280 { 5281 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5282 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5283 5284 // Step 1: Make the 2M entry invalid. We can't directly transition from a 5285 // valid 2M PTE to valid lower PTEs, because that could cause the 5286 // GPU TLBs to cache the same VA in different cache lines. That 5287 // could cause memory ordering to not be maintained. 5288 // 5289 // If the 2M PTE is already invalid, no TLB invalidate is needed. 5290 5291 if (curr_prot == UVM_PROT_NONE) { 5292 // If we aren't downgrading, then we don't need a membar. 5293 UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE); 5294 5295 // End the batch, which pushes a membar to ensure that the caller's PTE 5296 // writes below 2M are observed before the PDE write we're about to do. 5297 uvm_pte_batch_end(pte_batch); 5298 } 5299 else { 5300 // The 64k and 4k PTEs can't possibly be cached since the 2M entry is 5301 // not yet a PDE, so we just need to invalidate this single 2M entry. 5302 uvm_tlb_batch_begin(tree, tlb_batch); 5303 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); 5304 5305 // Make sure the PTE writes are observed before the TLB invalidate 5306 uvm_pte_batch_end(pte_batch); 5307 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5308 } 5309 5310 // Step 2: Switch the 2M entry from invalid to a PDE. This activates the 5311 // smaller PTEs. 5312 uvm_tlb_batch_begin(tree, tlb_batch); 5313 block_gpu_write_pde(block, gpu, push, tlb_batch); 5314 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5315 } 5316 5317 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE. 
5318 // Any lower PTEs are invalidated with the specified membar. 5319 static void block_gpu_pte_merge_2m(uvm_va_block_t *block, 5320 uvm_va_block_context_t *block_context, 5321 uvm_gpu_t *gpu, 5322 uvm_push_t *push, 5323 uvm_membar_t tlb_membar) 5324 { 5325 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5326 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5327 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5328 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5329 NvU32 tlb_inval_sizes; 5330 5331 UVM_ASSERT(!gpu_state->pte_is_2m); 5332 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); 5333 5334 // The 2M entry is currently a PDE, so first make it invalid. We can't 5335 // directly transition the entry from a valid PDE to a valid 2M PTE, because 5336 // that could cause the GPU TLBs to cache the same VA in different cache 5337 // lines. That could cause memory ordering to not be maintained. 5338 uvm_pte_batch_begin(push, pte_batch); 5339 block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL); 5340 uvm_pte_batch_end(pte_batch); 5341 5342 // Now invalidate both the 2M entry we just wrote as well as all lower-level 5343 // entries which could be cached. Subsequent MMU fills will stop at the now- 5344 // invalid 2M entry, so we only need to invalidate the lower PTEs without 5345 // actually writing them. 5346 tlb_inval_sizes = UVM_PAGE_SIZE_2M; 5347 if (gpu_state->page_table_range_big.table) 5348 tlb_inval_sizes |= UVM_PAGE_SIZE_64K; 5349 5350 // Strictly-speaking we only need to invalidate those 4k ranges which are 5351 // not covered by a big pte. However, any such invalidate will require 5352 // enough 4k invalidates to force the TLB batching to invalidate everything 5353 // anyway, so just do the simpler thing. 5354 if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5355 tlb_inval_sizes |= UVM_PAGE_SIZE_4K; 5356 5357 uvm_tlb_batch_begin(tree, tlb_batch); 5358 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE); 5359 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5360 5361 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused 5362 // PTEs with a pattern which will trigger fatal faults on access. We have to 5363 // do this after the TLB invalidate of the 2M entry, or the GPU might use 5364 // the new values. 5365 if (UVM_IS_DEBUG()) { 5366 uvm_pte_batch_begin(push, pte_batch); 5367 5368 if (gpu_state->page_table_range_big.table) { 5369 block_gpu_pte_clear_big(block, 5370 gpu, 5371 NULL, 5372 tree->hal->poisoned_pte(), 5373 pte_batch, 5374 NULL); 5375 } 5376 5377 if (gpu_state->page_table_range_4k.table) { 5378 block_gpu_pte_clear_4k(block, 5379 gpu, 5380 NULL, 5381 tree->hal->poisoned_pte(), 5382 pte_batch, 5383 NULL); 5384 } 5385 5386 uvm_pte_batch_end(pte_batch); 5387 } 5388 } 5389 5390 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) 5391 { 5392 // Permissions upgrades (MAP) don't need membars 5393 if (pte_op == BLOCK_PTE_OP_MAP) 5394 return UVM_MEMBAR_NONE; 5395 5396 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5397 UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE); 5398 5399 return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id)); 5400 } 5401 5402 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot 5403 // permissions. 
If the 2M entry is currently a PDE, it is first merged into a 5404 // PTE. 5405 // 5406 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead. 5407 // 5408 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5409 // the TLB membar required. 5410 static void block_gpu_map_to_2m(uvm_va_block_t *block, 5411 uvm_va_block_context_t *block_context, 5412 uvm_gpu_t *gpu, 5413 uvm_processor_id_t resident_id, 5414 uvm_prot_t new_prot, 5415 uvm_push_t *push, 5416 block_pte_op_t pte_op) 5417 { 5418 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5419 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5420 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5421 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5422 uvm_membar_t tlb_membar; 5423 5424 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5425 5426 // If we have a mix of big and 4k PTEs, we have to first merge them to an 5427 // invalid 2M PTE. 5428 if (!gpu_state->pte_is_2m) { 5429 block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE); 5430 5431 gpu_state->pte_is_2m = true; 5432 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5433 } 5434 5435 // Write the new permissions 5436 uvm_pte_batch_begin(push, pte_batch); 5437 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 5438 5439 block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch); 5440 5441 uvm_pte_batch_end(pte_batch); 5442 5443 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5444 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5445 } 5446 5447 // Combination split + map operation, called when only part of a 2M PTE mapping 5448 // is being changed. This splits an existing valid or invalid 2M PTE into the 5449 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state. 5450 // 5451 // The PTEs covering the pages in pages_to_write are written to the memory on 5452 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. 5453 // 5454 // The PTEs covering the pages not set in pages_to_write inherit the mapping of 5455 // the current 2M PTE. If the current mapping is valid, it must target 5456 // resident_id. 5457 // 5458 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5459 // the TLB membar required. 
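// Illustrative sketch (hypothetical caller state): granting new_prot on part
// of a block whose GPU mapping is currently a single 2M PTE. The caller is
// assumed to have already filled in block_context->mapping.new_pte_state to
// describe the resulting big/4k layout.
//
//     if (gpu_state->pte_is_2m) {
//         block_gpu_map_split_2m(block, block_context, gpu, resident_id,
//                                pages_to_write, new_prot, push,
//                                BLOCK_PTE_OP_MAP);
//     }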
5460 static void block_gpu_map_split_2m(uvm_va_block_t *block, 5461 uvm_va_block_context_t *block_context, 5462 uvm_gpu_t *gpu, 5463 uvm_processor_id_t resident_id, 5464 const uvm_page_mask_t *pages_to_write, 5465 uvm_prot_t new_prot, 5466 uvm_push_t *push, 5467 block_pte_op_t pte_op) 5468 { 5469 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5470 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5471 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5472 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5473 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5474 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5475 uvm_membar_t tlb_membar; 5476 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5477 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5478 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5479 5480 UVM_ASSERT(gpu_state->pte_is_2m); 5481 5482 if (!gpu_state->page_table_range_4k.table) 5483 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5484 5485 uvm_pte_batch_begin(push, pte_batch); 5486 5487 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5488 // from the lower levels. This means we don't need to issue a TLB invalidate 5489 // when writing those levels. 5490 5491 // Cases to handle: 5492 // 1) Big PTEs which inherit curr_prot 5493 // 2) Big PTEs which get new_prot 5494 // 3) Big PTEs which are split to 4k 5495 // a) 4k PTEs which inherit curr_prot under the split big PTEs 5496 // b) 4k PTEs which get new_prot under the split big PTEs 5497 5498 // Compute the big PTEs which will need to be split to 4k, if any. 5499 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5500 5501 if (gpu_state->page_table_range_big.table) { 5502 // Case 1: Write the big PTEs which will inherit the 2M permissions, if 5503 // any. These are the big PTEs which are unchanged (uncovered) by the 5504 // operation. 5505 bitmap_andnot(big_ptes_inherit, 5506 new_pte_state->big_ptes, 5507 new_pte_state->big_ptes_covered, 5508 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5509 5510 if (curr_prot == UVM_PROT_NONE) { 5511 block_gpu_pte_clear_big(block, 5512 gpu, 5513 big_ptes_inherit, 5514 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), 5515 pte_batch, 5516 NULL); 5517 } 5518 else { 5519 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); 5520 } 5521 5522 // Case 2: Write the new big PTEs 5523 bitmap_and(big_ptes_new_prot, 5524 new_pte_state->big_ptes, 5525 new_pte_state->big_ptes_covered, 5526 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5527 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL); 5528 5529 // Case 3: Write the big PTEs which cover 4k PTEs 5530 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5531 5532 // We just wrote all possible big PTEs, so mark them as initialized 5533 gpu_state->initialized_big = true; 5534 } 5535 else { 5536 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5537 } 5538 5539 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs 5540 block_gpu_pte_big_split_write_4k(block, 5541 block_context, 5542 gpu, 5543 resident_id, 5544 new_prot, 5545 big_ptes_split, 5546 pages_to_write, 5547 pte_batch); 5548 5549 // Activate the 2M PDE. 
This ends the pte_batch and issues a single TLB 5550 // invalidate for the 2M entry. 5551 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5552 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar); 5553 5554 gpu_state->pte_is_2m = false; 5555 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5556 } 5557 5558 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed. 5559 // 5560 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should 5561 // be 4k. 5562 static void block_gpu_split_2m(uvm_va_block_t *block, 5563 uvm_va_block_context_t *block_context, 5564 uvm_gpu_t *gpu, 5565 const unsigned long *new_big_ptes, 5566 uvm_push_t *push) 5567 { 5568 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5569 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5570 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5571 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5572 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5573 DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5574 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5575 NvU64 unmapped_pte_val; 5576 uvm_processor_id_t curr_residency; 5577 5578 UVM_ASSERT(gpu_state->pte_is_2m); 5579 5580 if (new_big_ptes) 5581 bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5582 else 5583 bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5584 5585 if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5586 UVM_ASSERT(gpu_state->page_table_range_big.table); 5587 5588 // We're splitting from 2M to big only, so we'll be writing all big PTEs 5589 if (gpu_state->page_table_range_big.table) 5590 gpu_state->initialized_big = true; 5591 5592 // Cases to handle: 5593 // 1) Big PTEs which inherit curr_prot 5594 // 2) Big PTEs which are split to 4k 5595 // a) 4k PTEs inherit curr_prot under the split big PTEs 5596 5597 // big_ptes_split will cover the 4k regions 5598 bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5599 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split); 5600 5601 uvm_pte_batch_begin(push, pte_batch); 5602 5603 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5604 // from the lower levels. This means we don't need to issue a TLB invalidate 5605 // when writing those levels. 
5606 5607 if (curr_prot == UVM_PROT_NONE) { 5608 unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size); 5609 5610 // Case 2a: Clear the 4k PTEs under big_ptes_split 5611 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 5612 5613 // Case 1: Make the remaining big PTEs unmapped 5614 block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL); 5615 } 5616 else { 5617 curr_residency = block_gpu_get_processor_to_map(block, gpu, 0); 5618 5619 // Case 2a: Write the new 4k PTEs under big_ptes_split 5620 block_gpu_pte_write_4k(block, 5621 gpu, 5622 curr_residency, 5623 curr_prot, 5624 &block_context->scratch_page_mask, 5625 pte_batch, 5626 NULL); 5627 5628 // Case 1: Write the new big PTEs 5629 block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL); 5630 } 5631 5632 // Case 2: Make big_ptes_split invalid to activate the 4k PTEs 5633 if (gpu_state->page_table_range_big.table) 5634 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5635 5636 // Activate the 2M PDE. This ends the pte_batch and issues a single TLB 5637 // invalidate for the 2M entry. No membar is necessary since we aren't 5638 // changing permissions. 5639 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE); 5640 5641 gpu_state->pte_is_2m = false; 5642 bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5643 } 5644 5645 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are 5646 // changed. 5647 // 5648 // big_ptes_to_split must not be NULL. 5649 static void block_gpu_split_big(uvm_va_block_t *block, 5650 uvm_va_block_context_t *block_context, 5651 uvm_gpu_t *gpu, 5652 const unsigned long *big_ptes_to_split, 5653 uvm_push_t *push) 5654 { 5655 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5656 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5657 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5658 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5659 NvU32 big_page_size = tree->big_page_size; 5660 uvm_va_block_region_t big_region; 5661 uvm_processor_id_t resident_id; 5662 size_t big_page_index; 5663 uvm_prot_t curr_prot; 5664 DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5665 5666 UVM_ASSERT(!gpu_state->pte_is_2m); 5667 UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5668 UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5669 5670 uvm_pte_batch_begin(push, pte_batch); 5671 uvm_tlb_batch_begin(tree, tlb_batch); 5672 5673 // Write all 4k PTEs under all big PTEs which are being split. We'll make 5674 // the big PTEs inactive below after flushing these writes. No TLB 5675 // invalidate is needed since the big PTE is active. 
5676 bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5677 for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5678 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5679 curr_prot = block_page_prot_gpu(block, gpu, big_region.first); 5680 5681 uvm_page_mask_zero(&block_context->scratch_page_mask); 5682 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 5683 if (curr_prot == UVM_PROT_NONE) { 5684 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 5685 } 5686 else { 5687 __set_bit(big_page_index, big_ptes_valid); 5688 5689 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 5690 5691 block_gpu_pte_write_4k(block, 5692 gpu, 5693 resident_id, 5694 curr_prot, 5695 &block_context->scratch_page_mask, 5696 pte_batch, 5697 NULL); 5698 } 5699 } 5700 5701 // Unmap the big PTEs which are valid and are being split to 4k. We can't 5702 // directly transition from a valid big PTE to valid lower PTEs, because 5703 // that could cause the GPU TLBs to cache the same VA in different cache 5704 // lines. That could cause memory ordering to not be maintained. 5705 block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); 5706 5707 // End the batches. We have to commit the membars and TLB invalidates 5708 // before we finish splitting formerly-big PTEs. No membar is necessary 5709 // since we aren't changing permissions. 5710 uvm_pte_batch_end(pte_batch); 5711 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5712 5713 // Finish the split by switching the big PTEs from unmapped to invalid. This 5714 // causes the GPU MMU to start reading the 4k PTEs instead of stopping at 5715 // the unmapped big PTEs. 5716 uvm_pte_batch_begin(push, pte_batch); 5717 uvm_tlb_batch_begin(tree, tlb_batch); 5718 5719 block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch); 5720 5721 uvm_pte_batch_end(pte_batch); 5722 5723 // Finally, activate the page tables if they're inactive 5724 if (block_gpu_needs_to_activate_table(block, gpu)) 5725 block_gpu_write_pde(block, gpu, push, tlb_batch); 5726 5727 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5728 5729 bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5730 } 5731 5732 // Changes permissions on some pre-existing mix of big and 4k PTEs into some 5733 // other mix of big and 4k PTEs, as described by 5734 // block_context->mapping.new_pte_state. 5735 // 5736 // The PTEs covering the pages in pages_to_write are written to the memory on 5737 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. 5738 // 5739 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5740 // the TLB membar required. 
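// Illustrative sketch (hypothetical dispatch, not the driver's actual mapping
// entry point): a higher-level mapping routine is assumed to choose between
// the 2M helpers above and this function based on the current PTE layout.
//
//     if (gpu_state->pte_is_2m) {
//         block_gpu_map_split_2m(block, block_context, gpu, resident_id,
//                                pages_to_write, new_prot, push, pte_op);
//     }
//     else {
//         block_gpu_map_big_and_4k(block, block_context, gpu, resident_id,
//                                  pages_to_write, new_prot, push, pte_op);
//     }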
5741 static void block_gpu_map_big_and_4k(uvm_va_block_t *block, 5742 uvm_va_block_context_t *block_context, 5743 uvm_gpu_t *gpu, 5744 uvm_processor_id_t resident_id, 5745 const uvm_page_mask_t *pages_to_write, 5746 uvm_prot_t new_prot, 5747 uvm_push_t *push, 5748 block_pte_op_t pte_op) 5749 { 5750 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5751 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5752 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5753 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5754 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5755 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5756 DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5757 DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5758 DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5759 uvm_va_block_region_t big_region; 5760 size_t big_page_index; 5761 NvU32 big_page_size = tree->big_page_size; 5762 uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5763 5764 UVM_ASSERT(!gpu_state->pte_is_2m); 5765 5766 uvm_pte_batch_begin(push, pte_batch); 5767 uvm_tlb_batch_begin(tree, tlb_batch); 5768 5769 // All of these cases might be performed in the same call: 5770 // 1) Split currently-big PTEs to 4k 5771 // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs 5772 // b) Write new 4k PTEs which get new_prot under the split big PTEs 5773 // 2) Merge currently-4k PTEs to big with new_prot 5774 // 3) Write currently-big PTEs which wholly get new_prot 5775 // 4) Write currently-4k PTEs which get new_prot 5776 // 5) Initialize big PTEs which are not covered by this operation 5777 5778 // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are 5779 // being split. We'll make the big PTEs inactive below after flushing these 5780 // writes. No TLB invalidate is needed since the big PTE is active. 5781 // 5782 // Mask computation: big_before && !big_after 5783 bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5784 5785 block_gpu_pte_big_split_write_4k(block, 5786 block_context, 5787 gpu, 5788 resident_id, 5789 new_prot, 5790 big_ptes_split, 5791 pages_to_write, 5792 pte_batch); 5793 5794 // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and 5795 // remain uncovered after the operation. 5796 // 5797 // Mask computation: !big_before && !big_after 5798 bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5799 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after); 5800 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) { 5801 block_gpu_pte_write_4k(block, 5802 gpu, 5803 resident_id, 5804 new_prot, 5805 &block_context->scratch_page_mask, 5806 pte_batch, 5807 tlb_batch); 5808 } 5809 5810 // Case 5: If the big page table is newly-allocated, make sure that all big 5811 // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are 5812 // all initialized to invalid. 5813 // 5814 // The similar case of making newly-allocated big PTEs unmapped when no 5815 // lower 4k table is present is handled by having 5816 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes 5817 // appropriately.
5818 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { 5819 // TODO: Bug 1766424: If we have the 4k page table already, we could 5820 // attempt to merge all uncovered big PTE regions when first 5821 // allocating the big table. That's probably not worth doing. 5822 UVM_ASSERT(gpu_state->page_table_range_4k.table); 5823 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5824 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); 5825 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); 5826 gpu_state->initialized_big = true; 5827 } 5828 5829 // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are 5830 // being split to 4k. We can't directly transition from a valid big PTE to 5831 // valid lower PTEs, because that could cause the GPU TLBs to cache the same 5832 // VA in different cache lines. That could cause memory ordering to not be 5833 // maintained. 5834 bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5835 for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5836 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5837 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first)) 5838 __set_bit(big_page_index, big_ptes_mask); 5839 } 5840 5841 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); 5842 5843 // Case 3: Write the currently-big PTEs which remain big PTEs, and are 5844 // wholly changing permissions. 5845 // 5846 // Mask computation: big_before && big_after && covered 5847 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5848 if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5849 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch); 5850 5851 // Case 2 (step 1): Merge the new big PTEs and end the batches, now that 5852 // we've done all of the independent PTE writes we can. This also merges 5853 // newly-allocated uncovered big PTEs to unmapped (see 5854 // block_gpu_compute_new_pte_state). 5855 // 5856 // Mask computation: !big_before && big_after 5857 if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 5858 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB 5859 // batches. 5860 block_gpu_pte_merge_big_and_end(block, 5861 block_context, 5862 gpu, 5863 big_ptes_merge, 5864 push, 5865 pte_batch, 5866 tlb_batch, 5867 tlb_membar); 5868 5869 // Remove uncovered big PTEs. We needed to merge them to unmapped above, 5870 // but they shouldn't get new_prot below. 5871 bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5872 } 5873 else { 5874 // End the batches. We have to commit the membars and TLB invalidates 5875 // before we finish splitting formerly-big PTEs. 
5876 uvm_pte_batch_end(pte_batch); 5877 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5878 } 5879 5880 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 5881 !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 5882 block_gpu_needs_to_activate_table(block, gpu)) { 5883 5884 uvm_pte_batch_begin(push, pte_batch); 5885 uvm_tlb_batch_begin(tree, tlb_batch); 5886 5887 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by 5888 // switching them from unmapped to invalid. This causes the GPU MMU to 5889 // start reading the 4k PTEs instead of stopping at the unmapped big 5890 // PTEs. 5891 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); 5892 5893 // Case 2 (step 2): Finish merging our big PTEs, if we have any, by 5894 // switching them from unmapped to new_prot. 5895 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch); 5896 5897 uvm_pte_batch_end(pte_batch); 5898 5899 // Finally, activate the page tables if they're inactive 5900 if (block_gpu_needs_to_activate_table(block, gpu)) 5901 block_gpu_write_pde(block, gpu, push, tlb_batch); 5902 5903 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5904 } 5905 5906 // Update gpu_state 5907 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5908 } 5909 5910 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is 5911 // merged into a PTE. 5912 static void block_gpu_unmap_to_2m(uvm_va_block_t *block, 5913 uvm_va_block_context_t *block_context, 5914 uvm_gpu_t *gpu, 5915 uvm_push_t *push, 5916 uvm_membar_t tlb_membar) 5917 { 5918 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5919 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5920 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5921 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5922 5923 if (gpu_state->pte_is_2m) { 5924 // If we're already mapped as a valid 2M PTE, just write it to invalid 5925 uvm_pte_batch_begin(push, pte_batch); 5926 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 5927 5928 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); 5929 5930 uvm_pte_batch_end(pte_batch); 5931 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5932 } 5933 else { 5934 // Otherwise we have a mix of big and 4K PTEs which need to be merged 5935 // into an invalid 2M PTE. 5936 block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar); 5937 5938 gpu_state->pte_is_2m = true; 5939 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5940 } 5941 } 5942 5943 // Combination split + unmap operation, called when only part of a valid 2M PTE 5944 // mapping is being unmapped. The 2M PTE is split into a mix of valid and 5945 // invalid big and/or 4k PTEs, as described by 5946 // block_context->mapping.new_pte_state. 5947 // 5948 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped). 5949 // 5950 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of 5951 // the current 2M PTE. 
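//
// For illustration (a hypothetical scenario assuming a 64K big page size, not
// a statement about any particular GPU): if the block is mapped read-write by
// a single 2M PTE and pages_to_unmap contains only the first 4K page, then the
// first 64K big PTE is split: its sixteen 4k PTEs are written (one unmapped,
// fifteen inheriting read-write), that big PTE is left invalid so the MMU
// reads the 4k level, and the remaining big PTEs are written as valid big PTEs
// which inherit read-write. Only the final PDE activation needs a TLB
// invalidate, since the 2M PTE stays active until then.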
5952 static void block_gpu_unmap_split_2m(uvm_va_block_t *block, 5953 uvm_va_block_context_t *block_context, 5954 uvm_gpu_t *gpu, 5955 const uvm_page_mask_t *pages_to_unmap, 5956 uvm_push_t *push, 5957 uvm_membar_t tlb_membar) 5958 { 5959 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5960 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5961 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5962 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5963 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5964 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5965 uvm_processor_id_t resident_id; 5966 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5967 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5968 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5969 5970 UVM_ASSERT(gpu_state->pte_is_2m); 5971 5972 resident_id = block_gpu_get_processor_to_map(block, gpu, 0); 5973 5974 uvm_pte_batch_begin(push, pte_batch); 5975 5976 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5977 // from the lower levels. This means we don't need to issue a TLB invalidate 5978 // when writing those levels. 5979 5980 // Cases to handle: 5981 // 1) Big PTEs which inherit curr_prot 5982 // 2) Big PTEs which get unmapped 5983 // 3) Big PTEs which are split to 4k 5984 // a) 4k PTEs which inherit curr_prot under the split big PTEs 5985 // b) 4k PTEs which get unmapped under the split big PTEs 5986 5987 // Compute the big PTEs which will need to be split to 4k, if any. 5988 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5989 5990 if (gpu_state->page_table_range_big.table) { 5991 // Case 1: Write the big PTEs which will inherit the 2M permissions, if 5992 // any. These are the big PTEs which are unchanged (uncovered) by the 5993 // operation. 5994 bitmap_andnot(big_ptes_inherit, 5995 new_pte_state->big_ptes, 5996 new_pte_state->big_ptes_covered, 5997 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5998 5999 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); 6000 6001 // Case 2: Clear the new big PTEs which get unmapped (those not covering 6002 // 4ks) 6003 bitmap_and(big_ptes_new_prot, 6004 new_pte_state->big_ptes, 6005 new_pte_state->big_ptes_covered, 6006 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6007 6008 block_gpu_pte_clear_big(block, 6009 gpu, 6010 big_ptes_new_prot, 6011 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), 6012 pte_batch, 6013 NULL); 6014 6015 // Case 3: Write the big PTEs which cover 4k PTEs 6016 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 6017 6018 // We just wrote all possible big PTEs, so mark them as initialized 6019 gpu_state->initialized_big = true; 6020 } 6021 else { 6022 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6023 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6024 } 6025 6026 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs 6027 block_gpu_pte_big_split_write_4k(block, 6028 block_context, 6029 gpu, 6030 resident_id, 6031 UVM_PROT_NONE, 6032 big_ptes_split, 6033 pages_to_unmap, 6034 pte_batch); 6035 6036 // And activate the 2M PDE. This ends the pte_batch and issues a single TLB 6037 // invalidate for the 2M entry. 
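    // (A single invalidate of the 2M range is sufficient because, as noted
    // above, the GPU MMU could not have fetched or cached any lower-level
    // entries while the 2M entry was still an active PTE.)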
6038     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6039
6040     gpu_state->pte_is_2m = false;
6041     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6042 }
6043
6044 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
6045 // and 4k PTEs.
6046 //
6047 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
6048 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
6049                                        uvm_va_block_context_t *block_context,
6050                                        uvm_gpu_t *gpu,
6051                                        const uvm_page_mask_t *pages_to_unmap,
6052                                        uvm_push_t *push,
6053                                        uvm_membar_t tlb_membar)
6054 {
6055     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6056     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6057     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6058     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6059     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6060     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6061     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6062     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6063     NvU32 big_page_size = tree->big_page_size;
6064     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
6065
6066     UVM_ASSERT(!gpu_state->pte_is_2m);
6067
6068     uvm_pte_batch_begin(push, pte_batch);
6069     uvm_tlb_batch_begin(tree, tlb_batch);
6070
6071     // All of these cases might be performed in the same call:
6072     // 1) Split currently-big PTEs to 4k
6073     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
6074     //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
6075     // 2) Merge currently-4k PTEs to unmapped big
6076     // 3) Clear currently-big PTEs which wholly get unmapped
6077     // 4) Clear currently-4k PTEs which get unmapped
6078     // 5) Initialize big PTEs which are not covered by this operation
6079
6080     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
6081     // being split. We'll make the big PTEs inactive below after flushing these
6082     // writes. No TLB invalidate is needed since the big PTE is active.
6083     //
6084     // Mask computation: big_before && !big_after
6085     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6086
6087     block_gpu_pte_big_split_write_4k(block,
6088                                      block_context,
6089                                      gpu,
6090                                      UVM_ID_INVALID,
6091                                      UVM_PROT_NONE,
6092                                      big_ptes_split,
6093                                      pages_to_unmap,
6094                                      pte_batch);
6095
6096     // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
6097     // remain uncovered after the unmap.
6098     //
6099     // Mask computation: !big_before && !big_after
6100     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6101     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
6102     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
6103         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
6104
6105     // Case 5: If the big page table is newly-allocated, make sure that all big
6106     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
6107     // all initialized to invalid.
6108 // 6109 // The similar case of making newly-allocated big PTEs unmapped when no 6110 // lower 4k table is present is handled by having 6111 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes 6112 // appropriately. 6113 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { 6114 // TODO: Bug 1766424: If we have the 4k page table already, we could 6115 // attempt to merge all uncovered big PTE regions when first 6116 // allocating the big table. That's probably not worth doing. 6117 UVM_ASSERT(gpu_state->page_table_range_4k.table); 6118 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6119 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); 6120 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); 6121 gpu_state->initialized_big = true; 6122 } 6123 6124 // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are 6125 // getting wholly unmapped, and those currently-big PTEs which are being 6126 // split to 4k. We can't directly transition from a valid big PTE to valid 6127 // lower PTEs, because that could cause the GPU TLBs to cache the same VA in 6128 // different cache lines. That could cause memory ordering to not be 6129 // maintained. 6130 // 6131 // Mask computation: (big_before && big_after && covered) || 6132 // (big_before && !big_after) 6133 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6134 bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6135 bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6136 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch); 6137 6138 // Case 2: Merge the new big PTEs and end the batches, now that we've done 6139 // all of the independent PTE writes we can. 6140 // 6141 // Mask computation: !big_before && big_after 6142 if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 6143 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB 6144 // batches. 6145 block_gpu_pte_merge_big_and_end(block, 6146 block_context, 6147 gpu, 6148 big_ptes_mask, 6149 push, 6150 pte_batch, 6151 tlb_batch, 6152 tlb_membar); 6153 } 6154 else { 6155 // End the batches. We have to commit the membars and TLB invalidates 6156 // before we finish splitting formerly-big PTEs. 6157 uvm_pte_batch_end(pte_batch); 6158 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 6159 } 6160 6161 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 6162 block_gpu_needs_to_activate_table(block, gpu)) { 6163 uvm_pte_batch_begin(push, pte_batch); 6164 uvm_tlb_batch_begin(tree, tlb_batch); 6165 6166 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by 6167 // switching them from unmapped to invalid. This causes the GPU MMU to 6168 // start reading the 4k PTEs instead of stopping at the unmapped big 6169 // PTEs. 
6170 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); 6171 6172 uvm_pte_batch_end(pte_batch); 6173 6174 // Finally, activate the page tables if they're inactive 6175 if (block_gpu_needs_to_activate_table(block, gpu)) 6176 block_gpu_write_pde(block, gpu, push, tlb_batch); 6177 6178 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 6179 } 6180 6181 // Update gpu_state 6182 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6183 } 6184 6185 // When PTE state is about to change (for example due to a map/unmap/revoke 6186 // operation), this function decides how to split and merge the PTEs in response 6187 // to that operation. 6188 // 6189 // The operation is described with the two page masks: 6190 // 6191 // - pages_changing indicates which pages will have their PTE mappings changed 6192 // on the GPU in some way as a result of the operation (for example, which 6193 // pages will actually have their mapping permissions upgraded). 6194 // 6195 // - page_mask_after indicates which pages on this GPU will have exactly the 6196 // same PTE attributes (permissions, residency) as pages_changing after the 6197 // operation is applied. 6198 // 6199 // PTEs are merged eagerly. 6200 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block, 6201 uvm_gpu_t *gpu, 6202 uvm_processor_id_t resident_id, 6203 const uvm_page_mask_t *pages_changing, 6204 const uvm_page_mask_t *page_mask_after, 6205 uvm_va_block_new_pte_state_t *new_pte_state) 6206 { 6207 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6208 uvm_va_block_region_t big_region_all, big_page_region, region; 6209 NvU32 big_page_size; 6210 uvm_page_index_t page_index; 6211 size_t big_page_index; 6212 DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6213 bool can_make_new_big_ptes; 6214 6215 memset(new_pte_state, 0, sizeof(*new_pte_state)); 6216 new_pte_state->needs_4k = true; 6217 6218 // TODO: Bug 1676485: Force a specific page size for perf testing 6219 6220 if (gpu_state->force_4k_ptes) 6221 return; 6222 6223 // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(), 6224 // hmm_range_fault(), and make_device_exclusive_range() don't handle folios 6225 // yet. Also, it makes mremap() difficult since the new address may not 6226 // align with the GPU block size otherwise. 6227 // If PAGE_SIZE is 64K, the code following this check is OK since 64K 6228 // big_pages is supported on all HMM supported GPUs (Turing+). 6229 // TODO: Bug 3368756: add support for transparent huge pages (THP). 6230 if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K) 6231 return; 6232 6233 UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after)); 6234 6235 // If all pages in the 2M mask have the same attributes after the 6236 // operation is applied, we can use a 2M PTE. 6237 if (block_gpu_supports_2m(block, gpu) && 6238 uvm_page_mask_full(page_mask_after) && 6239 (UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) { 6240 new_pte_state->pte_is_2m = true; 6241 new_pte_state->needs_4k = false; 6242 return; 6243 } 6244 6245 // Find big PTEs with matching attributes 6246 6247 // Can this block fit any big pages? 
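    // For example (illustrative): a small or misaligned block which does not
    // fully contain any big-page-aligned region yields an empty big_region_all,
    // and the function returns below with needs_4k still set.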
6248 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 6249 big_region_all = uvm_va_block_big_page_region_all(block, big_page_size); 6250 if (big_region_all.first >= big_region_all.outer) 6251 return; 6252 6253 new_pte_state->needs_4k = false; 6254 6255 can_make_new_big_ptes = true; 6256 6257 // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+). 6258 if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages) 6259 can_make_new_big_ptes = false; 6260 6261 // We must not fail during teardown: unmap (resident_id == UVM_ID_INVALID) 6262 // with no splits required. That means we should avoid allocating PTEs 6263 // which are only needed for merges. 6264 // 6265 // This only matters if we're merging to big PTEs. If we're merging to 2M, 6266 // then we must already have the 2M level (since it has to be allocated 6267 // before the lower levels). 6268 // 6269 // If pte_is_2m already and we don't have a big table, we're splitting so we 6270 // have to allocate. 6271 if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m) 6272 can_make_new_big_ptes = false; 6273 6274 for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) { 6275 uvm_va_block_region_t contig_region = {0}; 6276 6277 big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size); 6278 big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 6279 6280 if (!UVM_ID_IS_INVALID(resident_id)) 6281 contig_region = block_phys_contig_region(block, page_index, resident_id); 6282 6283 __set_bit(big_page_index, new_pte_state->big_ptes_covered); 6284 6285 // When mapping sysmem, we can use big pages only if we are mapping all 6286 // pages in the big page subregion and the CPU pages backing the 6287 // subregion are physically contiguous. 6288 if (can_make_new_big_ptes && 6289 uvm_page_mask_region_full(page_mask_after, big_page_region) && 6290 (!UVM_ID_IS_CPU(resident_id) || 6291 (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) { 6292 __set_bit(big_page_index, new_pte_state->big_ptes); 6293 } 6294 6295 if (!test_bit(big_page_index, new_pte_state->big_ptes)) 6296 new_pte_state->needs_4k = true; 6297 6298 // Skip to the end of the region 6299 page_index = big_page_region.outer - 1; 6300 } 6301 6302 if (!new_pte_state->needs_4k) { 6303 // All big page regions in pages_changing will be big PTEs. Now check if 6304 // there are any unaligned pages outside of big_region_all which are 6305 // changing. 6306 region = uvm_va_block_region(0, big_region_all.first); 6307 if (!uvm_page_mask_region_empty(pages_changing, region)) { 6308 new_pte_state->needs_4k = true; 6309 } 6310 else { 6311 region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block)); 6312 if (!uvm_page_mask_region_empty(pages_changing, region)) 6313 new_pte_state->needs_4k = true; 6314 } 6315 } 6316 6317 // Now add in the PTEs which should be big but weren't covered by this 6318 // operation. 6319 // 6320 // Note that we can't assume that a given page table range has been 6321 // initialized if it's present here, since it could have been allocated by a 6322 // thread which had to restart its operation due to allocation retry. 
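    // For example (illustrative): if the block is currently a single 2M PTE and
    // this operation only touches one 64K region, every other big-page slot is
    // uncovered; the first branch below adds all of those slots to big_ptes so
    // they become big PTEs which inherit the 2M permissions when the 2M PTE is
    // split.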
6323 if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) { 6324 // We're splitting a 2M PTE so all of the uncovered big PTE regions will 6325 // become big PTEs which inherit the 2M permissions. If we haven't 6326 // allocated the 2M table yet, it will start as a 2M PTE until the lower 6327 // levels are allocated, so it's the same split case regardless of 6328 // whether this operation will need to retry a later allocation. 6329 bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6330 } 6331 else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) { 6332 // If we don't have 4k PTEs and we won't be allocating them for this 6333 // operation, all of our PTEs need to be big. 6334 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6335 bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6336 bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size)); 6337 } 6338 else { 6339 // Otherwise, add in all of the currently-big PTEs which are unchanging. 6340 // They won't be written, but they need to be carried into the new 6341 // gpu_state->big_ptes when it's updated. 6342 bitmap_andnot(big_ptes_not_covered, 6343 gpu_state->big_ptes, 6344 new_pte_state->big_ptes_covered, 6345 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6346 } 6347 6348 bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6349 } 6350 6351 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that 6352 // handles allocation retry. If the block lock has been unlocked and relocked as 6353 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal 6354 // to the caller that the operation likely needs to be restarted. If that 6355 // happens, the pending tracker is added to the block's tracker. 6356 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block, 6357 uvm_gpu_t *gpu, 6358 NvU32 page_size, 6359 uvm_page_table_range_t *page_table_range, 6360 uvm_tracker_t *pending_tracker) 6361 { 6362 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 6363 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 6364 uvm_page_tree_t *page_tables = &gpu_va_space->page_tables; 6365 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); 6366 uvm_page_table_range_t local_range; 6367 NV_STATUS status; 6368 6369 // Blocks may contain large PTEs without starting on a PTE boundary or 6370 // having an aligned size. Cover the PTEs of this size in the block's 6371 // interior so we match uvm_va_block_gpu_state_t::big_ptes. 6372 NvU64 start = UVM_ALIGN_UP(va_block->start, page_size); 6373 NvU64 size = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start; 6374 6375 // VA blocks which can use the 2MB level as either a PTE or a PDE need to 6376 // account for the PDE specially, so they must use uvm_page_tree_alloc_table 6377 // to allocate the lower levels. 
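    // (In that case the lower-level table is requested below with
    // uvm_page_tree_alloc_table() anchored on page_table_range_2m, rather than
    // with uvm_page_tree_get_ptes().)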
6378 bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M; 6379 6380 UVM_ASSERT(page_table_range->table == NULL); 6381 6382 if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) { 6383 --va_block_test->page_table_allocation_retry_force_count; 6384 status = NV_ERR_NO_MEMORY; 6385 } 6386 else if (use_alloc_table) { 6387 // Pascal+: 4k/64k tables under a 2M entry 6388 UVM_ASSERT(gpu_state->page_table_range_2m.table); 6389 status = uvm_page_tree_alloc_table(page_tables, 6390 page_size, 6391 UVM_PMM_ALLOC_FLAGS_NONE, 6392 &gpu_state->page_table_range_2m, 6393 page_table_range); 6394 } 6395 else { 6396 // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+ 6397 status = uvm_page_tree_get_ptes(page_tables, 6398 page_size, 6399 start, 6400 size, 6401 UVM_PMM_ALLOC_FLAGS_NONE, 6402 page_table_range); 6403 } 6404 6405 if (status == NV_OK) 6406 goto allocated; 6407 6408 if (status != NV_ERR_NO_MEMORY) 6409 return status; 6410 6411 // Before unlocking the block lock, any pending work on the block has to be 6412 // added to the block's tracker. 6413 if (pending_tracker) { 6414 status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker); 6415 if (status != NV_OK) 6416 return status; 6417 } 6418 6419 // Unlock the va block and retry with eviction enabled 6420 uvm_mutex_unlock(&va_block->lock); 6421 6422 if (use_alloc_table) { 6423 // Although we don't hold the block lock here, it's safe to pass 6424 // gpu_state->page_table_range_2m to the page tree code because we know 6425 // that the 2m range has already been allocated, and that it can't go 6426 // away while we have the va_space lock held. 6427 status = uvm_page_tree_alloc_table(page_tables, 6428 page_size, 6429 UVM_PMM_ALLOC_FLAGS_EVICT, 6430 &gpu_state->page_table_range_2m, 6431 &local_range); 6432 } 6433 else { 6434 status = uvm_page_tree_get_ptes(page_tables, 6435 page_size, 6436 start, 6437 size, 6438 UVM_PMM_ALLOC_FLAGS_EVICT, 6439 &local_range); 6440 } 6441 6442 uvm_mutex_lock(&va_block->lock); 6443 6444 if (status != NV_OK) 6445 return status; 6446 6447 status = NV_ERR_MORE_PROCESSING_REQUIRED; 6448 6449 if (page_table_range->table) { 6450 // A different caller allocated the page tables in the meantime, release the 6451 // local copy. 6452 uvm_page_tree_put_ptes(page_tables, &local_range); 6453 return status; 6454 } 6455 6456 *page_table_range = local_range; 6457 6458 allocated: 6459 // Mark the 2M PTE as active when we first allocate it, since we don't have 6460 // any PTEs below it yet. 6461 if (page_size == UVM_PAGE_SIZE_2M) { 6462 UVM_ASSERT(!gpu_state->pte_is_2m); 6463 gpu_state->pte_is_2m = true; 6464 } 6465 else if (page_size != UVM_PAGE_SIZE_4K) { 6466 // uvm_page_tree_get_ptes initializes big PTEs to invalid. 6467 // uvm_page_tree_alloc_table does not, so we'll have to do it later. 6468 if (use_alloc_table) 6469 UVM_ASSERT(!gpu_state->initialized_big); 6470 else 6471 gpu_state->initialized_big = true; 6472 } 6473 6474 return status; 6475 } 6476 6477 // Helper which allocates all page table ranges necessary for the given page 6478 // sizes. See block_alloc_pt_range_with_retry. 
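//
// Hypothetical usage sketch (illustrative only, not an actual call site in
// this file):
//
//     status = block_alloc_ptes_with_retry(va_block, gpu, UVM_PAGE_SIZE_4K, NULL);
//     if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
//         // The block lock was dropped and re-acquired during the allocation,
//         // so the caller re-checks block state and restarts its operation.
//     }
//     else if (status != NV_OK) {
//         // Allocation failed even with eviction allowed; propagate the error.
//     }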
6479 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block, 6480 uvm_gpu_t *gpu, 6481 NvU32 page_sizes, 6482 uvm_tracker_t *pending_tracker) 6483 { 6484 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 6485 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 6486 uvm_page_table_range_t *range; 6487 NvU32 page_size; 6488 NV_STATUS status, final_status = NV_OK; 6489 6490 UVM_ASSERT(gpu_state); 6491 6492 // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first 6493 // in order to allocate the levels below. 6494 if (block_gpu_supports_2m(va_block, gpu)) 6495 page_sizes |= UVM_PAGE_SIZE_2M; 6496 6497 UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes); 6498 6499 for_each_chunk_size_rev(page_size, page_sizes) { 6500 if (page_size == UVM_PAGE_SIZE_2M) 6501 range = &gpu_state->page_table_range_2m; 6502 else if (page_size == UVM_PAGE_SIZE_4K) 6503 range = &gpu_state->page_table_range_4k; 6504 else 6505 range = &gpu_state->page_table_range_big; 6506 6507 if (range->table) 6508 continue; 6509 6510 if (page_size == UVM_PAGE_SIZE_2M) { 6511 UVM_ASSERT(!gpu_state->pte_is_2m); 6512 UVM_ASSERT(!gpu_state->page_table_range_big.table); 6513 UVM_ASSERT(!gpu_state->page_table_range_4k.table); 6514 } 6515 else if (page_size != UVM_PAGE_SIZE_4K) { 6516 UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0); 6517 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6518 } 6519 6520 status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker); 6521 6522 // Keep going to allocate the remaining levels even if the allocation 6523 // requires a retry, since we'll likely still need them when we retry 6524 // anyway. 6525 if (status == NV_ERR_MORE_PROCESSING_REQUIRED) 6526 final_status = NV_ERR_MORE_PROCESSING_REQUIRED; 6527 else if (status != NV_OK) 6528 return status; 6529 } 6530 6531 return final_status; 6532 } 6533 6534 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block, 6535 uvm_gpu_t *gpu, 6536 uvm_va_block_new_pte_state_t *new_pte_state, 6537 uvm_tracker_t *pending_tracker) 6538 { 6539 NvU32 page_sizes = 0; 6540 6541 if (new_pte_state->pte_is_2m) { 6542 page_sizes |= UVM_PAGE_SIZE_2M; 6543 } 6544 else { 6545 if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 6546 page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu); 6547 6548 if (new_pte_state->needs_4k) 6549 page_sizes |= UVM_PAGE_SIZE_4K; 6550 else 6551 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6552 } 6553 6554 return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker); 6555 } 6556 6557 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block. 6558 // This is currently used on ATS systems to prevent GPUs from inadvertently 6559 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables, 6560 // which is where the NOATS bit resides. 6561 // 6562 // The current implementation simply pre-allocates the PTEs for the VA Block, 6563 // which is wasteful because the GPU may never need them. 6564 // 6565 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1 6566 // page table entries without having to request PTEs. 
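//
// For example (illustrative): on an ATS-enabled system, a block whose GPU
// mappings can use a 2M entry requests only that single entry, which is the
// cheapest way to force the GMMU levels down to PDE1 (where the NOATS bit
// lives) to exist for this block's range.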
6567 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block, 6568 uvm_gpu_va_space_t *gpu_va_space, 6569 uvm_tracker_t *pending_tracker) 6570 { 6571 NvU32 page_sizes; 6572 NvU32 big_page_size; 6573 uvm_gpu_t *gpu; 6574 uvm_va_block_gpu_state_t *gpu_state; 6575 6576 UVM_ASSERT(block); 6577 UVM_ASSERT(gpu_va_space); 6578 UVM_ASSERT(gpu_va_space->ats.enabled); 6579 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 6580 6581 gpu = gpu_va_space->gpu; 6582 big_page_size = gpu_va_space->page_tables.big_page_size; 6583 6584 gpu_state = block_gpu_state_get_alloc(block, gpu); 6585 if (!gpu_state) 6586 return NV_ERR_NO_MEMORY; 6587 6588 // If the VA Block supports 2M pages, allocate the 2M PTE only, as it 6589 // requires less memory 6590 if (block_gpu_supports_2m(block, gpu)) 6591 page_sizes = UVM_PAGE_SIZE_2M; 6592 else if (uvm_va_block_num_big_pages(block, big_page_size) > 0) 6593 page_sizes = big_page_size; 6594 else 6595 page_sizes = UVM_PAGE_SIZE_4K; 6596 6597 return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker); 6598 } 6599 6600 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker) 6601 { 6602 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6603 NV_STATUS status = NV_OK; 6604 6605 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See 6606 // comments in block_pre_populate_pde1_gpu. 6607 if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) { 6608 uvm_gpu_va_space_t *gpu_va_space; 6609 6610 for_each_gpu_va_space(gpu_va_space, va_space) { 6611 // We only care about systems where ATS is supported and the application 6612 // enabled it. 6613 if (!gpu_va_space->ats.enabled) 6614 continue; 6615 6616 status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker); 6617 if (status != NV_OK) 6618 break; 6619 } 6620 } 6621 6622 return status; 6623 } 6624 6625 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block, 6626 uvm_va_block_context_t *block_context, 6627 uvm_gpu_t *gpu, 6628 const uvm_page_mask_t *unmap_page_mask, 6629 uvm_tracker_t *out_tracker) 6630 { 6631 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6632 uvm_pte_bits_gpu_t pte_bit; 6633 uvm_push_t push; 6634 uvm_membar_t tlb_membar; 6635 bool only_local_mappings; 6636 uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask; 6637 NV_STATUS status; 6638 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 6639 bool mask_empty; 6640 6641 // We have to check gpu_state before looking at any VA space state like our 6642 // gpu_va_space, because we could be on the eviction path where we don't 6643 // have a lock on that state. However, since remove_gpu_va_space walks each 6644 // block to unmap the GPU before destroying the gpu_va_space, we're 6645 // guaranteed that if this GPU has page tables, the gpu_va_space can't go 6646 // away while we're holding the block lock. 6647 if (!block_gpu_has_page_tables(block, gpu)) 6648 return NV_OK; 6649 6650 if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])) 6651 return NV_OK; 6652 6653 // block_gpu_compute_new_pte_state needs a mask of pages which will have 6654 // matching attributes after the operation is performed. In the case of 6655 // unmap, those are the pages with unset bits. 
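    // For example (illustrative): if pages {0..15} are currently mapped and
    // pages_to_unmap = {0..7}, the andnot/complement below produces a
    // page_mask_after containing every page except {8..15}, i.e. all pages
    // which will have no GPU mapping once the unmap completes.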
6656 uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap); 6657 uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask); 6658 block_gpu_compute_new_pte_state(block, 6659 gpu, 6660 UVM_ID_INVALID, 6661 pages_to_unmap, 6662 &block_context->scratch_page_mask, 6663 new_pte_state); 6664 6665 status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker); 6666 if (status != NV_OK) 6667 return status; 6668 6669 only_local_mappings = !block_has_remote_mapping_gpu(block, block_context, gpu->id, pages_to_unmap); 6670 tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings); 6671 6672 status = uvm_push_begin_acquire(gpu->channel_manager, 6673 UVM_CHANNEL_TYPE_MEMOPS, 6674 &block->tracker, 6675 &push, 6676 "Unmapping pages in block [0x%llx, 0x%llx)", 6677 block->start, 6678 block->end + 1); 6679 if (status != NV_OK) 6680 return status; 6681 6682 if (new_pte_state->pte_is_2m) { 6683 // We're either unmapping a whole valid 2M PTE, or we're unmapping all 6684 // remaining pages in a split 2M PTE. 6685 block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar); 6686 } 6687 else if (gpu_state->pte_is_2m) { 6688 // The block is currently mapped as a valid 2M PTE and we're unmapping 6689 // some pages within the 2M, so we have to split it into the appropriate 6690 // mix of big and 4k PTEs. 6691 block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); 6692 } 6693 else { 6694 // We're unmapping some pre-existing mix of big and 4K PTEs into some 6695 // other mix of big and 4K PTEs. 6696 block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); 6697 } 6698 6699 uvm_push_end(&push); 6700 6701 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { 6702 uvm_processor_mask_t non_uvm_lite_gpus; 6703 uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block)); 6704 6705 UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id)); 6706 6707 // If the GPU is the only non-UVM-Lite processor with mappings, we can 6708 // safely mark pages as fully unmapped 6709 if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1) 6710 uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap); 6711 } 6712 6713 // Clear block PTE state 6714 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 6715 mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], 6716 &gpu_state->pte_bits[pte_bit], 6717 pages_to_unmap); 6718 if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty) 6719 uvm_processor_mask_clear(&block->mapped, gpu->id); 6720 } 6721 6722 UVM_ASSERT(block_check_mappings(block)); 6723 6724 return uvm_tracker_add_push_safe(out_tracker, &push); 6725 } 6726 6727 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block, 6728 uvm_va_block_context_t *va_block_context, 6729 uvm_processor_id_t id, 6730 uvm_va_block_region_t region, 6731 const uvm_page_mask_t *unmap_page_mask, 6732 uvm_tracker_t *out_tracker) 6733 { 6734 uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask; 6735 6736 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 6737 uvm_assert_mutex_locked(&va_block->lock); 6738 6739 if (UVM_ID_IS_CPU(id)) { 6740 block_unmap_cpu(va_block, region, unmap_page_mask); 6741 return NV_OK; 6742 } 6743 6744 uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask); 6745 6746 return 
block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker); 6747 } 6748 6749 // This function essentially works as a wrapper around vm_insert_page (hence 6750 // the similar function prototype). This is needed since vm_insert_page 6751 // doesn't take permissions as input, but uses vma->vm_page_prot instead. 6752 // Since we may have multiple VA blocks under one VMA which need to map 6753 // with different permissions, we have to manually change vma->vm_page_prot for 6754 // each call to vm_insert_page. Multiple faults under one VMA in separate 6755 // blocks can be serviced concurrently, so the VMA wrapper lock is used 6756 // to protect access to vma->vm_page_prot. 6757 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma, 6758 NvU64 addr, 6759 struct page *page, 6760 uvm_prot_t new_prot) 6761 { 6762 uvm_vma_wrapper_t *vma_wrapper; 6763 unsigned long target_flags; 6764 pgprot_t target_pgprot; 6765 int ret; 6766 6767 UVM_ASSERT(vma); 6768 UVM_ASSERT(vma->vm_private_data); 6769 6770 vma_wrapper = vma->vm_private_data; 6771 target_flags = vma->vm_flags; 6772 6773 if (new_prot == UVM_PROT_READ_ONLY) 6774 target_flags &= ~VM_WRITE; 6775 6776 target_pgprot = vm_get_page_prot(target_flags); 6777 6778 // Take VMA wrapper lock to check vma->vm_page_prot 6779 uvm_down_read(&vma_wrapper->lock); 6780 6781 // Take a write lock if we need to modify the VMA vm_page_prot 6782 // - vma->vm_page_prot creates writable PTEs but new prot is RO 6783 // - vma->vm_page_prot creates read-only PTEs but new_prot is RW 6784 if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) { 6785 uvm_up_read(&vma_wrapper->lock); 6786 uvm_down_write(&vma_wrapper->lock); 6787 6788 vma->vm_page_prot = target_pgprot; 6789 6790 uvm_downgrade_write(&vma_wrapper->lock); 6791 } 6792 6793 ret = vm_insert_page(vma, addr, page); 6794 uvm_up_read(&vma_wrapper->lock); 6795 if (ret) { 6796 UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret); 6797 return errno_to_nv_status(ret); 6798 } 6799 6800 return NV_OK; 6801 } 6802 6803 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block, 6804 uvm_va_block_context_t *va_block_context, 6805 uvm_page_index_t page_index) 6806 { 6807 struct vm_area_struct *vma; 6808 uvm_prot_t logical_prot; 6809 6810 if (uvm_va_block_is_hmm(va_block)) { 6811 NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index); 6812 6813 logical_prot = uvm_hmm_compute_logical_prot(va_block, va_block_context, addr); 6814 } 6815 else { 6816 uvm_va_range_t *va_range = va_block->va_range; 6817 6818 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 6819 6820 // Zombified VA ranges no longer have a vma, so they have no permissions 6821 if (uvm_va_range_is_managed_zombie(va_range)) { 6822 logical_prot = UVM_PROT_NONE; 6823 } 6824 else { 6825 vma = uvm_va_range_vma(va_range); 6826 6827 if (!(vma->vm_flags & VM_READ)) 6828 logical_prot = UVM_PROT_NONE; 6829 else if (!(vma->vm_flags & VM_WRITE)) 6830 logical_prot = UVM_PROT_READ_ONLY; 6831 else 6832 logical_prot = UVM_PROT_READ_WRITE_ATOMIC; 6833 } 6834 } 6835 6836 return logical_prot; 6837 } 6838 6839 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page) 6840 { 6841 struct page *page; 6842 6843 if (UVM_ID_IS_CPU(block_page.processor)) { 6844 page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index); 6845 } 6846 else { 6847 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6848 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor); 6849 
size_t chunk_offset; 6850 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset); 6851 6852 UVM_ASSERT(gpu->mem_info.numa.enabled); 6853 page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE; 6854 } 6855 6856 UVM_ASSERT(page); 6857 return page; 6858 } 6859 6860 // Creates or upgrades a CPU mapping for the given page, updating the block's 6861 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page 6862 // will be mapped with at least new_prot permissions. 6863 // 6864 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use 6865 // block_unmap_cpu or uvm_va_block_revoke_prot instead. 6866 // 6867 // If the existing mapping is >= new_prot already, this is a no-op. 6868 // 6869 // It is the caller's responsibility to: 6870 // - Revoke mappings from other processors as appropriate so the CPU can map 6871 // with new_prot permissions 6872 // - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference 6873 // and mmap_lock is held in at least read mode) 6874 // - Ensure that the struct page corresponding to the physical memory being 6875 // mapped exists 6876 // - Manage the block's residency bitmap 6877 // - Ensure that the block hasn't been killed (block->va_range is present) 6878 // - Update the pte/mapping tracking state on success 6879 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block, 6880 uvm_va_block_context_t *va_block_context, 6881 uvm_processor_id_t resident_id, 6882 uvm_page_index_t page_index, 6883 uvm_prot_t new_prot) 6884 { 6885 uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index); 6886 uvm_va_range_t *va_range = block->va_range; 6887 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6888 struct vm_area_struct *vma; 6889 NV_STATUS status; 6890 NvU64 addr; 6891 struct page *page; 6892 6893 UVM_ASSERT(uvm_va_block_is_hmm(block) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 6894 UVM_ASSERT(new_prot != UVM_PROT_NONE); 6895 UVM_ASSERT(new_prot < UVM_PROT_MAX); 6896 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); 6897 6898 uvm_assert_mutex_locked(&block->lock); 6899 if (UVM_ID_IS_CPU(resident_id)) 6900 UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index)); 6901 6902 // For the CPU, write implies atomic 6903 if (new_prot == UVM_PROT_READ_WRITE) 6904 new_prot = UVM_PROT_READ_WRITE_ATOMIC; 6905 6906 // Only upgrades are supported in this function 6907 UVM_ASSERT(curr_prot <= new_prot); 6908 6909 if (new_prot == curr_prot) 6910 return NV_OK; 6911 6912 // Check for existing VMA permissions. They could have been modified after 6913 // the initial mmap by mprotect. 6914 if (new_prot > compute_logical_prot(block, va_block_context, page_index)) 6915 return NV_ERR_INVALID_ACCESS_TYPE; 6916 6917 if (uvm_va_block_is_hmm(block)) { 6918 // Do not map CPU pages because they belong to the Linux kernel. 6919 return NV_OK; 6920 } 6921 6922 UVM_ASSERT(va_range); 6923 6924 if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) { 6925 // Add the page's range group range to the range group's migrated list. 
6926 uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space, 6927 uvm_va_block_cpu_page_address(block, page_index)); 6928 if (rgr != NULL) { 6929 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock); 6930 if (list_empty(&rgr->range_group_migrated_list_node)) 6931 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges); 6932 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock); 6933 } 6934 } 6935 6936 // It's possible here that current->mm != vma->vm_mm. That can happen for 6937 // example due to access_process_vm (ptrace) or get_user_pages from another 6938 // driver. 6939 // 6940 // In such cases the caller has taken care of ref counting vma->vm_mm for 6941 // us, so we can safely operate on the vma but we can't use 6942 // uvm_va_range_vma_current. 6943 vma = uvm_va_range_vma(va_range); 6944 uvm_assert_mmap_lock_locked(vma->vm_mm); 6945 UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm); 6946 6947 // Add the mapping 6948 addr = uvm_va_block_cpu_page_address(block, page_index); 6949 6950 // This unmap handles upgrades as vm_insert_page returns -EBUSY when 6951 // there's already a mapping present at fault_addr, so we have to unmap 6952 // first anyway when upgrading from RO -> RW. 6953 if (curr_prot != UVM_PROT_NONE) 6954 unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1); 6955 6956 // Don't map the CPU until prior copies and GPU PTE updates finish, 6957 // otherwise we might not stay coherent. 6958 status = uvm_tracker_wait(&block->tracker); 6959 if (status != NV_OK) 6960 return status; 6961 6962 page = block_page_get(block, block_phys_page(resident_id, page_index)); 6963 return uvm_cpu_insert_page(vma, addr, page, new_prot); 6964 } 6965 6966 // Maps the CPU to the given pages which are resident on resident_id. 6967 // map_page_mask is an in/out parameter: the pages which are mapped to 6968 // resident_id are removed from the mask before returning. 6969 // 6970 // Caller must ensure that: 6971 // - Pages in map_page_mask must not be set in the corresponding cpu.pte_bits 6972 // mask for the requested protection. 6973 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block, 6974 uvm_va_block_context_t *block_context, 6975 uvm_processor_id_t resident_id, 6976 uvm_va_block_region_t region, 6977 uvm_page_mask_t *map_page_mask, 6978 uvm_prot_t new_prot, 6979 uvm_tracker_t *out_tracker) 6980 { 6981 NV_STATUS status = NV_OK; 6982 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6983 uvm_page_index_t page_index; 6984 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; 6985 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id); 6986 uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot); 6987 uvm_pte_bits_cpu_t pte_bit; 6988 6989 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); 6990 6991 // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls 6992 // within block_map_cpu_page_to by doing them once here is helpful. 6993 6994 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, 6995 map_page_mask, 6996 &block->cpu.pte_bits[prot_pte_bit])); 6997 6998 // The pages which will actually change are those in the input page mask 6999 // which are resident on the target. 
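    // For example (illustrative): if map_page_mask requests pages {0..63} but
    // only {0..31} are resident on resident_id, pages_to_map becomes {0..31}
    // and the rest stay set in map_page_mask so the caller can map them from
    // another residency on a later call.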
7000 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) 7001 return NV_OK; 7002 7003 status = block_pre_populate_pde1_all_gpus(block, out_tracker); 7004 if (status != NV_OK) 7005 return status; 7006 7007 block->cpu.ever_mapped = true; 7008 7009 for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) { 7010 status = block_map_cpu_page_to(block, 7011 block_context, 7012 resident_id, 7013 page_index, 7014 new_prot); 7015 if (status != NV_OK) 7016 break; 7017 7018 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); 7019 } 7020 7021 // If there was some error, shrink the region so that we only update the 7022 // pte/mapping tracking bits for the pages that succeeded 7023 if (status != NV_OK) { 7024 region = uvm_va_block_region(region.first, page_index); 7025 uvm_page_mask_region_clear_outside(pages_to_map, region); 7026 } 7027 7028 // If pages are mapped from a remote residency, notify the remote mapping 7029 // events to tools. We skip event notification if the cause is Invalid. We 7030 // use it to signal that this function is being called from the revocation 7031 // path to avoid reporting duplicate events. 7032 if (UVM_ID_IS_GPU(resident_id) && 7033 va_space->tools.enabled && 7034 block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) { 7035 uvm_va_block_region_t subregion; 7036 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { 7037 uvm_tools_record_map_remote(block, 7038 NULL, 7039 UVM_ID_CPU, 7040 resident_id, 7041 uvm_va_block_region_start(block, subregion), 7042 uvm_va_block_region_size(subregion), 7043 block_context->mapping.cause); 7044 } 7045 } 7046 7047 // Update CPU mapping state 7048 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) 7049 uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map); 7050 7051 uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map); 7052 7053 UVM_ASSERT(block_check_mappings(block)); 7054 7055 // Remove all pages that were newly-mapped from the input mask 7056 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); 7057 7058 return status; 7059 } 7060 7061 // Maps the GPU to the given pages which are resident on resident_id. 7062 // map_page_mask is an in/out parameter: the pages which are mapped 7063 // to resident_id are removed from the mask before returning. 7064 // 7065 // Caller must ensure that: 7066 // - Pages in map_page_mask must not be set in the corresponding pte_bits mask 7067 // for the requested protection on the mapping GPU. 
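//
// For example (illustrative): when upgrading pages to READ_WRITE,
// uvm_va_block_map() has already filtered out pages which are set in the
// pte_bits mask for that protection, so every page passed in here genuinely
// needs a PTE change; the assert below enforces that contract.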
7068 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block, 7069 uvm_va_block_context_t *block_context, 7070 uvm_gpu_t *gpu, 7071 uvm_processor_id_t resident_id, 7072 uvm_page_mask_t *map_page_mask, 7073 uvm_prot_t new_prot, 7074 uvm_tracker_t *out_tracker) 7075 { 7076 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7077 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7078 uvm_push_t push; 7079 NV_STATUS status; 7080 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; 7081 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id); 7082 uvm_pte_bits_gpu_t pte_bit; 7083 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot); 7084 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 7085 block_pte_op_t pte_op; 7086 7087 UVM_ASSERT(map_page_mask); 7088 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id)); 7089 7090 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) 7091 UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location)); 7092 7093 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, 7094 map_page_mask, 7095 &gpu_state->pte_bits[prot_pte_bit])); 7096 7097 // The pages which will actually change are those in the input page mask 7098 // which are resident on the target. 7099 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) 7100 return NV_OK; 7101 7102 UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map)); 7103 7104 // For PTE merge/split computation, compute all resident pages which will 7105 // have exactly new_prot after performing the mapping. 7106 uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map); 7107 if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) { 7108 uvm_page_mask_andnot(&block_context->scratch_page_mask, 7109 &block_context->scratch_page_mask, 7110 &gpu_state->pte_bits[prot_pte_bit + 1]); 7111 } 7112 uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask); 7113 7114 block_gpu_compute_new_pte_state(va_block, 7115 gpu, 7116 resident_id, 7117 pages_to_map, 7118 &block_context->scratch_page_mask, 7119 new_pte_state); 7120 7121 status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker); 7122 if (status != NV_OK) 7123 return status; 7124 7125 status = uvm_push_begin_acquire(gpu->channel_manager, 7126 UVM_CHANNEL_TYPE_MEMOPS, 7127 &va_block->tracker, 7128 &push, 7129 "Mapping pages in block [0x%llx, 0x%llx) as %s", 7130 va_block->start, 7131 va_block->end + 1, 7132 uvm_prot_string(new_prot)); 7133 if (status != NV_OK) 7134 return status; 7135 7136 pte_op = BLOCK_PTE_OP_MAP; 7137 if (new_pte_state->pte_is_2m) { 7138 // We're either modifying permissions of a pre-existing 2M PTE, or all 7139 // permissions match so we can merge to a new 2M PTE. 7140 block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op); 7141 } 7142 else if (gpu_state->pte_is_2m) { 7143 // Permissions on a subset of the existing 2M PTE are being upgraded, so 7144 // we have to split it into the appropriate mix of big and 4k PTEs. 
7145 block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); 7146 } 7147 else { 7148 // We're upgrading permissions on some pre-existing mix of big and 4K 7149 // PTEs into some other mix of big and 4K PTEs. 7150 block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); 7151 } 7152 7153 // If we are mapping remotely, record the event 7154 if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) { 7155 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block); 7156 7157 UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid); 7158 7159 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { 7160 uvm_tools_record_map_remote(va_block, 7161 &push, 7162 gpu->id, 7163 resident_id, 7164 uvm_va_block_region_start(va_block, subregion), 7165 uvm_va_block_region_size(subregion), 7166 block_context->mapping.cause); 7167 } 7168 } 7169 7170 uvm_push_end(&push); 7171 7172 // Update GPU mapping state 7173 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) 7174 uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map); 7175 7176 uvm_processor_mask_set(&va_block->mapped, gpu->id); 7177 7178 // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages 7179 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) 7180 uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map); 7181 7182 // Remove all pages resident on this processor from the input mask, which 7183 // were newly-mapped. 7184 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); 7185 7186 UVM_ASSERT(block_check_mappings(va_block)); 7187 7188 return uvm_tracker_add_push_safe(out_tracker, &push); 7189 } 7190 7191 static void map_get_allowed_destinations(uvm_va_block_t *block, 7192 uvm_va_block_context_t *va_block_context, 7193 const uvm_va_policy_t *policy, 7194 uvm_processor_id_t id, 7195 uvm_processor_mask_t *allowed_mask) 7196 { 7197 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7198 7199 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) { 7200 // UVM-Lite can only map resident pages on the preferred location 7201 uvm_processor_mask_zero(allowed_mask); 7202 uvm_processor_mask_set(allowed_mask, policy->preferred_location); 7203 } 7204 else if ((uvm_va_policy_is_read_duplicate(policy, va_space) || 7205 (uvm_id_equal(policy->preferred_location, id) && 7206 !is_uvm_fault_force_sysmem_set() && 7207 !uvm_hmm_must_use_sysmem(block, va_block_context))) && 7208 uvm_va_space_processor_has_memory(va_space, id)) { 7209 // When operating under read-duplication we should only map the local 7210 // processor to cause fault-and-duplicate of remote pages. 7211 // 7212 // The same holds when this processor is the preferred location: only 7213 // create local mappings to force remote pages to fault-and-migrate. 
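        // For example (illustrative): under read duplication, a fault from a
        // GPU on pages resident only elsewhere leaves just that GPU in
        // allowed_mask; since nothing is resident there yet, no mapping is
        // created by this call and the pages instead fault and duplicate into
        // that GPU's memory.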
7214 uvm_processor_mask_zero(allowed_mask); 7215 uvm_processor_mask_set(allowed_mask, id); 7216 } 7217 else { 7218 // Common case: Just map wherever the memory happens to reside 7219 uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]); 7220 return; 7221 } 7222 7223 // Clamp to resident and accessible processors 7224 uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident); 7225 uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]); 7226 } 7227 7228 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block, 7229 uvm_va_block_context_t *va_block_context, 7230 uvm_processor_id_t id, 7231 uvm_va_block_region_t region, 7232 const uvm_page_mask_t *map_page_mask, 7233 uvm_prot_t new_prot, 7234 UvmEventMapRemoteCause cause, 7235 uvm_tracker_t *out_tracker) 7236 { 7237 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7238 uvm_gpu_t *gpu = NULL; 7239 uvm_processor_mask_t allowed_destinations; 7240 uvm_processor_id_t resident_id; 7241 const uvm_page_mask_t *pte_mask; 7242 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask; 7243 NV_STATUS status; 7244 7245 va_block_context->mapping.cause = cause; 7246 7247 UVM_ASSERT(new_prot != UVM_PROT_NONE); 7248 UVM_ASSERT(new_prot < UVM_PROT_MAX); 7249 uvm_assert_mutex_locked(&va_block->lock); 7250 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 7251 7252 // Mapping is not supported on the eviction path that doesn't hold the VA 7253 // space lock. 7254 uvm_assert_rwsem_locked(&va_space->lock); 7255 7256 if (UVM_ID_IS_CPU(id)) { 7257 uvm_pte_bits_cpu_t prot_pte_bit; 7258 7259 // Check if the current thread is allowed to call vm_insert_page 7260 if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm)) 7261 return NV_OK; 7262 7263 prot_pte_bit = get_cpu_pte_bit_index(new_prot); 7264 pte_mask = &va_block->cpu.pte_bits[prot_pte_bit]; 7265 } 7266 else { 7267 uvm_va_block_gpu_state_t *gpu_state; 7268 uvm_pte_bits_gpu_t prot_pte_bit; 7269 7270 gpu = uvm_va_space_get_gpu(va_space, id); 7271 7272 // Although this GPU UUID is registered in the VA space, it might not have a 7273 // GPU VA space registered. 7274 if (!uvm_gpu_va_space_get(va_space, gpu)) 7275 return NV_OK; 7276 7277 gpu_state = block_gpu_state_get_alloc(va_block, gpu); 7278 if (!gpu_state) 7279 return NV_ERR_NO_MEMORY; 7280 7281 prot_pte_bit = get_gpu_pte_bit_index(new_prot); 7282 pte_mask = &gpu_state->pte_bits[prot_pte_bit]; 7283 } 7284 7285 uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask); 7286 7287 if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask)) 7288 return NV_OK; 7289 7290 // Map per resident location so we can more easily detect physically- 7291 // contiguous mappings. 
7292 map_get_allowed_destinations(va_block, va_block_context, va_block_context->policy, id, &allowed_destinations); 7293 7294 for_each_closest_id(resident_id, &allowed_destinations, id, va_space) { 7295 if (UVM_ID_IS_CPU(id)) { 7296 status = block_map_cpu_to(va_block, 7297 va_block_context, 7298 resident_id, 7299 region, 7300 running_page_mask, 7301 new_prot, 7302 out_tracker); 7303 } 7304 else { 7305 status = block_map_gpu_to(va_block, 7306 va_block_context, 7307 gpu, 7308 resident_id, 7309 running_page_mask, 7310 new_prot, 7311 out_tracker); 7312 } 7313 7314 if (status != NV_OK) 7315 return status; 7316 7317 // If we've mapped all requested pages, we're done 7318 if (uvm_page_mask_region_empty(running_page_mask, region)) 7319 break; 7320 } 7321 7322 return NV_OK; 7323 } 7324 7325 // Revokes the given pages mapped by cpu. This is implemented by unmapping all 7326 // pages and mapping them later with the lower permission. This is required 7327 // because vm_insert_page can only be used for upgrades from Invalid. 7328 // 7329 // Caller must ensure that: 7330 // - Pages in revoke_page_mask must be set in the 7331 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask. 7332 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block, 7333 uvm_va_block_context_t *block_context, 7334 uvm_va_block_region_t region, 7335 const uvm_page_mask_t *revoke_page_mask, 7336 uvm_tracker_t *out_tracker) 7337 { 7338 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7339 uvm_va_block_region_t subregion; 7340 7341 UVM_ASSERT(revoke_page_mask); 7342 7343 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 7344 7345 block_unmap_cpu(block, region, revoke_page_mask); 7346 7347 // Coalesce revocation event notification 7348 for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) { 7349 uvm_perf_event_notify_revocation(&va_space->perf_events, 7350 block, 7351 UVM_ID_CPU, 7352 uvm_va_block_region_start(block, subregion), 7353 uvm_va_block_region_size(subregion), 7354 UVM_PROT_READ_WRITE_ATOMIC, 7355 UVM_PROT_READ_ONLY); 7356 } 7357 7358 // uvm_va_block_map will skip this remap if we aren't holding the right mm 7359 // lock. 
7360 return uvm_va_block_map(block,
7361 block_context,
7362 UVM_ID_CPU,
7363 region,
7364 revoke_page_mask,
7365 UVM_PROT_READ_ONLY,
7366 UvmEventMapRemoteCauseInvalid,
7367 out_tracker);
7368 }
7369 
7370 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
7371 uvm_va_block_context_t *block_context,
7372 uvm_gpu_t *gpu,
7373 uvm_prot_t prot_revoked,
7374 const uvm_page_mask_t *pages_revoked)
7375 {
7376 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7377 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7378 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
7379 uvm_pte_bits_gpu_t pte_bit;
7380 
7381 for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
7382 uvm_prot_t old_prot;
7383 
7384 if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
7385 continue;
7386 
7387 if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
7388 old_prot = UVM_PROT_READ_WRITE_ATOMIC;
7389 else
7390 old_prot = UVM_PROT_READ_WRITE;
7391 
7392 for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
7393 uvm_perf_event_notify_revocation(&va_space->perf_events,
7394 block,
7395 gpu->id,
7396 uvm_va_block_region_start(block, subregion),
7397 uvm_va_block_region_size(subregion),
7398 old_prot,
7399 prot_revoked - 1);
7400 }
7401 }
7402 }
7403 
7404 // Revokes the given pages mapped by the GPU which are resident on resident_id.
7405 // revoke_page_mask is an in/out parameter: the pages which have the appropriate
7406 // permissions and are resident on resident_id are removed from the mask before
7407 // returning.
7408 //
7409 // Caller must ensure that:
7410 // - Pages in revoke_page_mask must be set in the corresponding pte_bits mask for
7411 // the protection to be revoked on the mapping GPU.
7412 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
7413 uvm_va_block_context_t *block_context,
7414 uvm_gpu_t *gpu,
7415 uvm_processor_id_t resident_id,
7416 uvm_page_mask_t *revoke_page_mask,
7417 uvm_prot_t prot_to_revoke,
7418 uvm_tracker_t *out_tracker)
7419 {
7420 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7421 uvm_push_t push;
7422 NV_STATUS status;
7423 uvm_pte_bits_gpu_t pte_bit;
7424 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7425 uvm_prot_t new_prot = prot_to_revoke - 1;
7426 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7427 block_pte_op_t pte_op;
7428 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7429 uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
7430 
7431 UVM_ASSERT(revoke_page_mask);
7432 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
7433 
7434 // The pages which will actually change are those in the input page mask
7435 // which are resident on the target.
7436 if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
7437 return NV_OK;
7438 
7439 UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke));
7440 
7441 // For PTE merge/split computation, compute all resident pages which will
7442 // have exactly prot_to_revoke-1 after performing the revocation.
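// Worked example (illustrative), revoking UVM_PROT_READ_WRITE_ATOMIC: a
// resident page whose ATOMIC pte_bit is set but which is not in
// pages_to_revoke keeps RWA, so it is excluded below. A resident page whose
// ATOMIC bit is set and which is in pages_to_revoke, or one which already has
// only the WRITE bit set, ends up with exactly READ_WRITE, so it is included.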
7443 uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
7444 uvm_page_mask_andnot(&block_context->scratch_page_mask,
7445 &gpu_state->pte_bits[prot_pte_bit - 1],
7446 &block_context->scratch_page_mask);
7447 uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7448 
7449 block_gpu_compute_new_pte_state(va_block,
7450 gpu,
7451 resident_id,
7452 pages_to_revoke,
7453 &block_context->scratch_page_mask,
7454 new_pte_state);
7455 
7456 status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7457 if (status != NV_OK)
7458 return status;
7459 
7460 status = uvm_push_begin_acquire(gpu->channel_manager,
7461 UVM_CHANNEL_TYPE_MEMOPS,
7462 &va_block->tracker,
7463 &push,
7464 "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
7465 uvm_prot_string(prot_to_revoke),
7466 va_block->start,
7467 va_block->end + 1);
7468 if (status != NV_OK)
7469 return status;
7470 
7471 pte_op = BLOCK_PTE_OP_REVOKE;
7472 if (new_pte_state->pte_is_2m) {
7473 // We're either modifying permissions of a pre-existing 2M PTE, or all
7474 // permissions match so we can merge to a new 2M PTE.
7475 block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7476 }
7477 else if (gpu_state->pte_is_2m) {
7478 // Permissions on a subset of the existing 2M PTE are being downgraded,
7479 // so we have to split it into the appropriate mix of big and 4k PTEs.
7480 block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7481 }
7482 else {
7483 // We're downgrading permissions on some pre-existing mix of big and 4K
7484 // PTEs into some other mix of big and 4K PTEs.
7485 block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7486 }
7487 
7488 uvm_push_end(&push);
7489 
7490 block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
7491 
7492 // Update GPU mapping state
7493 for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
7494 uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
7495 
7496 // Remove all pages resident on this processor from the input mask. This
7497 // includes pages which were just revoked and pages which already had the
7498 // correct permissions.
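// After this update the caller's running mask only holds pages that still
// need revoking against other resident processors; uvm_va_block_revoke_prot()
// below relies on that to stop iterating once the requested region is empty.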
7499 uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke); 7500 7501 UVM_ASSERT(block_check_mappings(va_block)); 7502 7503 return uvm_tracker_add_push_safe(out_tracker, &push); 7504 } 7505 7506 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block, 7507 uvm_va_block_context_t *va_block_context, 7508 uvm_processor_id_t id, 7509 uvm_va_block_region_t region, 7510 const uvm_page_mask_t *revoke_page_mask, 7511 uvm_prot_t prot_to_revoke, 7512 uvm_tracker_t *out_tracker) 7513 { 7514 uvm_gpu_t *gpu; 7515 uvm_va_block_gpu_state_t *gpu_state; 7516 uvm_processor_mask_t resident_procs; 7517 uvm_processor_id_t resident_id; 7518 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask; 7519 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7520 uvm_pte_bits_gpu_t prot_pte_bit; 7521 7522 UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY); 7523 UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX); 7524 uvm_assert_mutex_locked(&va_block->lock); 7525 7526 if (UVM_ID_IS_CPU(id)) { 7527 if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC) 7528 return NV_OK; 7529 7530 if (uvm_va_block_is_hmm(va_block)) { 7531 // Linux is responsible for CPU page table updates. 7532 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region); 7533 return NV_OK; 7534 } 7535 7536 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); 7537 7538 if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])) 7539 return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker); 7540 7541 return NV_OK; 7542 } 7543 7544 gpu = uvm_va_space_get_gpu(va_space, id); 7545 7546 // UVM-Lite GPUs should never have access revoked 7547 UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id), 7548 "GPU %s\n", uvm_gpu_name(gpu)); 7549 7550 // Return early if there are no mappings for the GPU present in the block 7551 if (!uvm_processor_mask_test(&va_block->mapped, gpu->id)) 7552 return NV_OK; 7553 7554 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7555 prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke); 7556 7557 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); 7558 7559 if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit])) 7560 return NV_OK; 7561 7562 // Revoke per resident location so we can more easily detect physically- 7563 // contiguous mappings. 
7564 uvm_processor_mask_copy(&resident_procs, &va_block->resident); 7565 7566 for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) { 7567 NV_STATUS status = block_revoke_prot_gpu_to(va_block, 7568 va_block_context, 7569 gpu, 7570 resident_id, 7571 running_page_mask, 7572 prot_to_revoke, 7573 out_tracker); 7574 if (status != NV_OK) 7575 return status; 7576 7577 // If we've revoked all requested pages, we're done 7578 if (uvm_page_mask_region_empty(running_page_mask, region)) 7579 break; 7580 } 7581 7582 return NV_OK; 7583 } 7584 7585 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block, 7586 uvm_va_block_context_t *va_block_context, 7587 const uvm_processor_mask_t *map_processor_mask, 7588 uvm_va_block_region_t region, 7589 const uvm_page_mask_t *map_page_mask, 7590 uvm_prot_t new_prot, 7591 UvmEventMapRemoteCause cause) 7592 { 7593 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7594 NV_STATUS status = NV_OK; 7595 NV_STATUS tracker_status; 7596 uvm_processor_id_t id; 7597 7598 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 7599 7600 for_each_id_in_mask(id, map_processor_mask) { 7601 status = uvm_va_block_map(va_block, 7602 va_block_context, 7603 id, 7604 region, 7605 map_page_mask, 7606 new_prot, 7607 cause, 7608 &local_tracker); 7609 if (status != NV_OK) 7610 break; 7611 } 7612 7613 // Regardless of error, add the successfully-pushed mapping operations into 7614 // the block's tracker. Note that we can't overwrite the tracker because we 7615 // aren't guaranteed that the map actually pushed anything (in which case it 7616 // would've acquired the block tracker first). 7617 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7618 uvm_tracker_deinit(&local_tracker); 7619 7620 return status == NV_OK ? tracker_status : status; 7621 } 7622 7623 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block, 7624 uvm_va_block_context_t *va_block_context, 7625 const uvm_processor_mask_t *unmap_processor_mask, 7626 uvm_va_block_region_t region, 7627 const uvm_page_mask_t *unmap_page_mask) 7628 { 7629 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7630 NV_STATUS status = NV_OK; 7631 NV_STATUS tracker_status; 7632 uvm_processor_id_t id; 7633 7634 // Watch out, unmap_mask could change during iteration since it could be 7635 // va_block->mapped. 7636 for_each_id_in_mask(id, unmap_processor_mask) { 7637 // Errors could either be a system-fatal error (ECC) or an allocation 7638 // retry due to PTE splitting. In either case we should stop after 7639 // hitting the first one. 7640 status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker); 7641 if (status != NV_OK) 7642 break; 7643 } 7644 7645 // See the comment in uvm_va_block_map_mask for adding to the tracker. 7646 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7647 uvm_tracker_deinit(&local_tracker); 7648 7649 return status == NV_OK ? 
tracker_status : status; 7650 } 7651 7652 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block, 7653 uvm_va_block_context_t *va_block_context, 7654 const uvm_processor_mask_t *revoke_processor_mask, 7655 uvm_va_block_region_t region, 7656 const uvm_page_mask_t *revoke_page_mask, 7657 uvm_prot_t prot_to_revoke) 7658 { 7659 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7660 NV_STATUS status = NV_OK; 7661 NV_STATUS tracker_status; 7662 uvm_processor_id_t id; 7663 7664 for_each_id_in_mask(id, revoke_processor_mask) { 7665 status = uvm_va_block_revoke_prot(va_block, 7666 va_block_context, 7667 id, 7668 region, 7669 revoke_page_mask, 7670 prot_to_revoke, 7671 &local_tracker); 7672 if (status != NV_OK) 7673 break; 7674 } 7675 7676 // See the comment in uvm_va_block_map_mask for adding to the tracker. 7677 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7678 uvm_tracker_deinit(&local_tracker); 7679 7680 return status == NV_OK ? tracker_status : status; 7681 } 7682 7683 // Updates the read_duplicated_pages mask in the block when the state of GPU id 7684 // is being destroyed 7685 static void update_read_duplicated_pages_mask(uvm_va_block_t *block, 7686 uvm_gpu_id_t id, 7687 uvm_va_block_gpu_state_t *gpu_state) 7688 { 7689 uvm_gpu_id_t running_id; 7690 bool first = true; 7691 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7692 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7693 uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask; 7694 uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask; 7695 7696 uvm_page_mask_zero(&block->read_duplicated_pages); 7697 7698 for_each_id_in_mask(running_id, &block->resident) { 7699 const uvm_page_mask_t *running_residency_mask; 7700 7701 if (uvm_id_equal(running_id, id)) 7702 continue; 7703 7704 running_residency_mask = uvm_va_block_resident_mask_get(block, running_id); 7705 7706 if (first) { 7707 uvm_page_mask_copy(running_page_mask, running_residency_mask); 7708 first = false; 7709 continue; 7710 } 7711 7712 if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask)) 7713 uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask); 7714 7715 uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask); 7716 } 7717 } 7718 7719 // Unmaps all GPU mappings under this block, frees the page tables, and frees 7720 // all the GPU chunks. This simply drops the chunks on the floor, so the caller 7721 // must take care of copying the data elsewhere if it needs to remain intact. 7722 // 7723 // This serializes on the block tracker since it must unmap page tables. 
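// Within this file it is called both from uvm_va_block_unregister_gpu_locked()
// and, for every GPU id, from block_kill(), e.g.:
//
//     for_each_gpu_id(id)
//         block_destroy_gpu_state(block, id);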
7724 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id) 7725 { 7726 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 7727 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7728 uvm_gpu_va_space_t *gpu_va_space; 7729 uvm_gpu_t *gpu, *other_gpu; 7730 7731 if (!gpu_state) 7732 return; 7733 7734 uvm_assert_mutex_locked(&block->lock); 7735 7736 // Unmap PTEs and free page tables 7737 gpu = uvm_va_space_get_gpu(va_space, id); 7738 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); 7739 if (gpu_va_space) { 7740 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7741 7742 uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context); 7743 } 7744 7745 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); 7746 7747 // No processor should have this GPU mapped at this point 7748 UVM_ASSERT(block_check_processor_not_mapped(block, id)); 7749 7750 // We need to remove the mappings of the indirect peers from the reverse 7751 // map when the GPU state is being destroyed (for example, on 7752 // unregister_gpu) and when peer access between indirect peers is disabled. 7753 // However, we need to avoid double mapping removals. There are two 7754 // possible scenarios: 7755 // - Disable peer access first. This will remove all mappings between A and 7756 // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to 7757 // unregister_gpu will not operate on that pair of GPUs. 7758 // - Unregister GPU first. This will remove all mappings from all indirect 7759 // peers to the GPU being unregistered. It will also destroy its GPU state. 7760 // Subsequent calls to disable peers will remove the mappings from the GPU 7761 // being unregistered, but never to the GPU being unregistered (since it no 7762 // longer has a valid GPU state). 7763 for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) 7764 block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu); 7765 7766 if (gpu_state->chunks) { 7767 size_t i, num_chunks; 7768 7769 update_read_duplicated_pages_mask(block, id, gpu_state); 7770 uvm_page_mask_zero(&gpu_state->resident); 7771 block_clear_resident_processor(block, id); 7772 7773 num_chunks = block_num_gpu_chunks(block, gpu); 7774 for (i = 0; i < num_chunks; i++) { 7775 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 7776 if (!chunk) 7777 continue; 7778 7779 uvm_mmu_chunk_unmap(chunk, &block->tracker); 7780 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); 7781 } 7782 7783 uvm_kvfree(gpu_state->chunks); 7784 } 7785 else { 7786 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 7787 } 7788 7789 7790 // Pending operations may still need the DMA memory to be mapped. 
7791 uvm_tracker_wait(&block->tracker);
7792 
7793 block_gpu_unmap_phys_all_cpu_pages(block, gpu);
7794 uvm_processor_mask_clear(&block->evicted_gpus, id);
7795 
7796 kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
7797 block->gpus[uvm_id_gpu_index(id)] = NULL;
7798 }
7799 
7800 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
7801 {
7802 if (range->table) {
7803 uvm_page_tree_put_ptes(tree, range);
7804 memset(range, 0, sizeof(*range));
7805 }
7806 }
7807 
7808 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space)
7809 {
7810 uvm_assert_mutex_locked(&va_block->lock);
7811 
7812 if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped)
7813 return NV_OK;
7814 
7815 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
7816 // comments in pre_populate_pde1_gpu.
7817 return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL);
7818 }
7819 
7820 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
7821 uvm_gpu_va_space_t *gpu_va_space,
7822 uvm_va_block_context_t *block_context)
7823 {
7824 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7825 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7826 uvm_gpu_t *gpu = gpu_va_space->gpu;
7827 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7828 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7829 uvm_push_t push;
7830 NV_STATUS status;
7831 
7832 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7833 
7834 if (!gpu_state)
7835 return;
7836 
7837 uvm_assert_mutex_locked(&va_block->lock);
7838 
7839 // Unmapping the whole block won't cause a page table split, so this should
7840 // only fail if we have a system-fatal error.
7841 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker);
7842 if (status != NV_OK) {
7843 UVM_ASSERT(status == uvm_global_get_status());
7844 return; // Just leak
7845 }
7846 
7847 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
7848 
7849 // Reset the page tables if other allocations could reuse them
7850 if (!block_gpu_supports_2m(va_block, gpu) &&
7851 !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7852 
7853 status = uvm_push_begin_acquire(gpu->channel_manager,
7854 UVM_CHANNEL_TYPE_MEMOPS,
7855 &local_tracker,
7856 &push,
7857 "Resetting PTEs for block [0x%llx, 0x%llx)",
7858 va_block->start,
7859 va_block->end + 1);
7860 if (status != NV_OK) {
7861 UVM_ASSERT(status == uvm_global_get_status());
7862 return; // Just leak
7863 }
7864 
7865 uvm_pte_batch_begin(&push, pte_batch);
7866 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
7867 
7868 // When a big PTE is active, the 4k PTEs under it are garbage. Make
7869 // them invalid so the page tree code can reuse them for other
7870 // allocations on this VA. These don't need TLB invalidates since the
7871 // big PTEs above them are active.
7872 if (gpu_state->page_table_range_4k.table) {
7873 uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes);
7874 block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
7875 }
7876 
7877 // We unmapped all big PTEs above, which means they have the unmapped
7878 // pattern so the GPU MMU won't read 4k PTEs under them. Set them to
7879 // invalid to activate the 4ks below so new allocations using just those
7880 // 4k PTEs will work.
7881 block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch); 7882 7883 uvm_pte_batch_end(pte_batch); 7884 uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE); 7885 7886 uvm_push_end(&push); 7887 uvm_tracker_overwrite_with_push(&local_tracker, &push); 7888 } 7889 7890 // The unmap must finish before we free the page tables 7891 status = uvm_tracker_wait_deinit(&local_tracker); 7892 if (status != NV_OK) 7893 return; // System-fatal error, just leak 7894 7895 // Note that if the PTE is currently 2M with lower tables allocated but not 7896 // in use, calling put_ptes on those lower ranges will re-write the 2M entry 7897 // to be a PDE. 7898 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k); 7899 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big); 7900 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m); 7901 7902 gpu_state->pte_is_2m = false; 7903 gpu_state->initialized_big = false; 7904 gpu_state->activated_big = false; 7905 gpu_state->activated_4k = false; 7906 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 7907 7908 UVM_ASSERT(block_check_mappings(va_block)); 7909 } 7910 7911 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 7912 { 7913 NV_STATUS status; 7914 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7915 7916 UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID); 7917 uvm_assert_rwsem_locked_write(&va_space->lock); 7918 uvm_assert_mutex_locked(&va_block->lock); 7919 7920 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { 7921 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7922 if (status != NV_OK) 7923 return status; 7924 7925 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0); 7926 if (status != NV_OK) { 7927 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7928 return status; 7929 } 7930 } 7931 7932 // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we 7933 // call it here. 7934 7935 return NV_OK; 7936 } 7937 7938 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 7939 { 7940 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7941 NV_STATUS status; 7942 uvm_tracker_t tracker = UVM_TRACKER_INIT(); 7943 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7944 uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask; 7945 const uvm_page_mask_t *resident0; 7946 const uvm_page_mask_t *resident1; 7947 7948 uvm_assert_mutex_locked(&va_block->lock); 7949 7950 // See comment in block_destroy_gpu_state 7951 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { 7952 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7953 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0); 7954 } 7955 7956 // If either of the GPUs doesn't have GPU state then nothing could be mapped 7957 // between them. 
7958 if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
7959 return;
7960 
7961 resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id);
7962 resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id);
7963 
7964 // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
7965 if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
7966 status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
7967 if (status != NV_OK) {
7968 // Since all PTEs unmapped by this call have the same aperture, page
7969 // splits should never be required so any failure should be the
7970 // result of a system-fatal error.
7971 UVM_ASSERT_MSG(status == uvm_global_get_status(),
7972 "Unmapping failed: %s, GPU %s\n",
7973 nvstatusToString(status),
7974 uvm_gpu_name(gpu0));
7975 }
7976 }
7977 
7978 // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
7979 if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
7980 status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
7981 if (status != NV_OK) {
7982 UVM_ASSERT_MSG(status == uvm_global_get_status(),
7983 "Unmapping failed: %s, GPU %s\n",
7984 nvstatusToString(status),
7985 uvm_gpu_name(gpu1));
7986 }
7987 }
7988 
7989 status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
7990 if (status != NV_OK)
7991 UVM_ASSERT(status == uvm_global_get_status());
7992 
7993 status = uvm_tracker_wait_deinit(&tracker);
7994 if (status != NV_OK)
7995 UVM_ASSERT(status == uvm_global_get_status());
7996 }
7997 
7998 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
7999 {
8000 NV_STATUS status;
8001 uvm_va_range_t *va_range = va_block->va_range;
8002 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8003 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8004 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
8005 
8006 uvm_assert_mutex_locked(&va_block->lock);
8007 UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
8008 
8009 // If the GPU doesn't have GPU state then nothing could be mapped.
8010 if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
8011 return;
8012 
8013 // In UVM-Lite mode, mappings to the preferred location are not tracked
8014 // directly, so just unmap the whole block.
8015 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
8016 if (status != NV_OK) {
8017 // Unmapping the whole block should not cause page splits so any failure
8018 // should be the result of a system-fatal error.
8019 UVM_ASSERT_MSG(status == uvm_global_get_status(),
8020 "Unmapping failed: %s, GPU %s\n",
8021 nvstatusToString(status), uvm_gpu_name(gpu));
8022 }
8023 
8024 status = uvm_tracker_wait(&va_block->tracker);
8025 if (status != NV_OK) {
8026 UVM_ASSERT_MSG(status == uvm_global_get_status(),
8027 "Unmapping failed: %s, GPU %s\n",
8028 nvstatusToString(status), uvm_gpu_name(gpu));
8029 }
8030 }
8031 
8032 // Evict pages from the GPU by moving each resident region to the CPU
8033 //
8034 // Notably the caller needs to support allocation-retry as
8035 // uvm_va_block_migrate_locked() requires that.
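// For example, uvm_va_block_unregister_gpu_locked() below wraps the call in
// the retry helper while already holding the block lock:
//
//     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
//                                        block_evict_pages_from_gpu(va_block, gpu, mm));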
8036 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8037 { 8038 NV_STATUS status = NV_OK; 8039 const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id); 8040 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); 8041 uvm_va_block_region_t subregion; 8042 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 8043 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm); 8044 8045 // Move all subregions resident on the GPU to the CPU 8046 for_each_va_block_subregion_in_mask(subregion, resident, region) { 8047 if (uvm_va_block_is_hmm(va_block)) { 8048 status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, 8049 gpu, 8050 block_context, 8051 resident, 8052 subregion); 8053 } 8054 else { 8055 status = uvm_va_block_migrate_locked(va_block, 8056 NULL, 8057 block_context, 8058 subregion, 8059 UVM_ID_CPU, 8060 UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, 8061 NULL); 8062 } 8063 if (status != NV_OK) 8064 return status; 8065 } 8066 8067 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id)); 8068 return NV_OK; 8069 } 8070 8071 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8072 { 8073 NV_STATUS status; 8074 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 8075 8076 uvm_assert_mutex_locked(&va_block->lock); 8077 8078 if (!gpu_state) 8079 return; 8080 8081 // The mappings should've already been torn down by GPU VA space unregister 8082 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); 8083 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); 8084 UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu)); 8085 8086 // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and 8087 // we don't rely on any state of the block across the call. 8088 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm)); 8089 if (status != NV_OK) { 8090 UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n", 8091 nvstatusToString(status), 8092 uvm_gpu_name(gpu)); 8093 uvm_global_set_fatal_error(status); 8094 } 8095 8096 // This function will copy the block's tracker into each chunk then free the 8097 // chunk to PMM. If we do this before waiting for the block tracker below 8098 // we'll populate PMM's free chunks with tracker entries, which gives us 8099 // better testing coverage of chunk synchronization on GPU unregister. 8100 block_destroy_gpu_state(va_block, gpu->id); 8101 8102 // Any time a GPU is unregistered we need to make sure that there are no 8103 // pending (direct or indirect) tracker entries for that GPU left in the 8104 // block's tracker. The only way to ensure that is to wait for the whole 8105 // tracker. 8106 status = uvm_tracker_wait(&va_block->tracker); 8107 if (status != NV_OK) 8108 UVM_ASSERT(status == uvm_global_get_status()); 8109 } 8110 8111 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8112 { 8113 // Take the lock internally to not expose the caller to allocation-retry. 
8114 uvm_mutex_lock(&va_block->lock);
8115 
8116 uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
8117 
8118 uvm_mutex_unlock(&va_block->lock);
8119 }
8120 
8121 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
8122 {
8123 uvm_page_index_t page_index;
8124 
8125 uvm_assert_mutex_locked(&va_block->lock);
8126 
8127 for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region)
8128 block_mark_cpu_page_dirty(va_block, page_index);
8129 }
8130 
8131 // Tears down everything within the block, but doesn't free the block itself.
8132 // Note that when uvm_va_block_kill is called, this is called twice: once for
8133 // the initial kill itself, then again when the block's ref count is eventually
8134 // destroyed. block->va_range is used to track whether the block has already
8135 // been killed.
8136 static void block_kill(uvm_va_block_t *block)
8137 {
8138 uvm_va_space_t *va_space;
8139 uvm_perf_event_data_t event_data;
8140 uvm_cpu_chunk_t *chunk;
8141 uvm_gpu_id_t id;
8142 NV_STATUS status;
8143 uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
8144 uvm_page_index_t page_index;
8145 uvm_page_index_t next_page_index;
8146 
8147 if (uvm_va_block_is_dead(block))
8148 return;
8149 
8150 va_space = uvm_va_block_get_va_space(block);
8151 event_data.block_destroy.block = block;
8152 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data);
8153 
8154 // Unmap all processors in parallel first. Unmapping the whole block won't
8155 // cause a page table split, so this should only fail if we have a system-
8156 // fatal error.
8157 if (!uvm_processor_mask_empty(&block->mapped)) {
8158 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8159 
8160 // HMM CPU mappings are controlled by Linux so no need to unmap.
8161 // Remote GPU mappings will be removed below.
8162 if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
8163 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]);
8164 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
8165 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
8166 }
8167 
8168 // We could only be killed with mapped GPU state by VA range free or VA
8169 // space teardown, so it's safe to use the va_space's block_context
8170 // because both of those have the VA space lock held in write mode.
8171 status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL);
8172 UVM_ASSERT(status == uvm_global_get_status());
8173 }
8174 
8175 UVM_ASSERT(uvm_processor_mask_empty(&block->mapped));
8176 
8177 // Free the GPU page tables and chunks
8178 for_each_gpu_id(id)
8179 block_destroy_gpu_state(block, id);
8180 
8181 // Wait for the GPU PTE unmaps before freeing CPU memory
8182 uvm_tracker_wait_deinit(&block->tracker);
8183 
8184 // No processor should have the CPU mapped at this point
8185 UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU));
8186 
8187 // Free CPU pages
8188 for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) {
8189 // Tell the OS we wrote to the page because we sometimes clear the dirty
8190 // bit after writing to it, so be conservative. HMM dirty flags are
8191 // managed by the kernel.
8192 if (!uvm_va_block_is_hmm(block)) 8193 uvm_cpu_chunk_mark_dirty(chunk, 0); 8194 uvm_cpu_chunk_remove_from_block(block, page_index); 8195 uvm_cpu_chunk_free(chunk); 8196 } 8197 8198 uvm_kvfree((void *)block->cpu.chunks); 8199 block->cpu.chunks = 0; 8200 8201 // Clearing the resident bit isn't strictly necessary since this block 8202 // is getting destroyed, but it keeps state consistent for assertions. 8203 uvm_page_mask_zero(&block->cpu.resident); 8204 block_clear_resident_processor(block, UVM_ID_CPU); 8205 8206 if (uvm_va_block_is_hmm(block)) 8207 uvm_va_policy_clear(block, block->start, block->end); 8208 8209 block->va_range = NULL; 8210 #if UVM_IS_CONFIG_HMM() 8211 block->hmm.va_space = NULL; 8212 #endif 8213 } 8214 8215 // Called when the block's ref count drops to 0 8216 void uvm_va_block_destroy(nv_kref_t *nv_kref) 8217 { 8218 uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref); 8219 8220 // Nobody else should have a reference when freeing 8221 uvm_assert_mutex_unlocked(&block->lock); 8222 8223 uvm_mutex_lock(&block->lock); 8224 block_kill(block); 8225 uvm_mutex_unlock(&block->lock); 8226 8227 if (uvm_enable_builtin_tests) { 8228 uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block); 8229 8230 kmem_cache_free(g_uvm_va_block_cache, block_wrapper); 8231 } 8232 else { 8233 kmem_cache_free(g_uvm_va_block_cache, block); 8234 } 8235 } 8236 8237 void uvm_va_block_kill(uvm_va_block_t *va_block) 8238 { 8239 uvm_mutex_lock(&va_block->lock); 8240 block_kill(va_block); 8241 uvm_mutex_unlock(&va_block->lock); 8242 8243 // May call block_kill again 8244 uvm_va_block_release(va_block); 8245 } 8246 8247 static void block_gpu_release_region(uvm_va_block_t *va_block, 8248 uvm_gpu_id_t gpu_id, 8249 uvm_va_block_gpu_state_t *gpu_state, 8250 uvm_page_mask_t *page_mask, 8251 uvm_va_block_region_t region) 8252 { 8253 uvm_page_index_t page_index; 8254 8255 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 8256 uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index]; 8257 8258 if (!gpu_chunk) 8259 continue; 8260 8261 // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks 8262 8263 uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker); 8264 8265 // The GPU chunk will be freed when the device private reference drops. 8266 if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) && 8267 uvm_page_mask_empty(&gpu_state->resident)) 8268 block_clear_resident_processor(va_block, gpu_id); 8269 8270 gpu_state->chunks[page_index] = NULL; 8271 } 8272 } 8273 8274 void uvm_va_block_munmap_region(uvm_va_block_t *va_block, 8275 uvm_va_block_region_t region) 8276 { 8277 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 8278 uvm_perf_event_data_t event_data; 8279 uvm_gpu_id_t gpu_id; 8280 8281 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 8282 uvm_assert_mutex_locked(&va_block->lock); 8283 8284 // Reset thrashing state for the region. 8285 event_data.block_munmap.block = va_block; 8286 event_data.block_munmap.region = region; 8287 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data); 8288 8289 // Set a flag so that GPU fault events are flushed since they might refer 8290 // to the region being unmapped. 8291 // Note that holding the va_block lock prevents GPU VA spaces from 8292 // being removed so the registered_gpu_va_spaces mask is stable. 
8293 for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) { 8294 uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id); 8295 } 8296 8297 // Release any remaining vidmem chunks in the given region. 8298 for_each_gpu_id(gpu_id) { 8299 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 8300 8301 if (!gpu_state) 8302 continue; 8303 8304 uvm_page_mask_region_clear(&gpu_state->evicted, region); 8305 if (uvm_page_mask_empty(&gpu_state->evicted)) 8306 uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id); 8307 8308 if (gpu_state->chunks) { 8309 block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region); 8310 8311 // TODO: bug 3660922: Need to update the read duplicated pages mask 8312 // when read duplication is supported for HMM. 8313 } 8314 else { 8315 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id)); 8316 } 8317 } 8318 8319 uvm_va_policy_clear(va_block, 8320 uvm_va_block_region_start(va_block, region), 8321 uvm_va_block_region_end(va_block, region)); 8322 } 8323 8324 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 8325 { 8326 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 8327 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 8328 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 8329 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); 8330 NvU32 alloc_sizes; 8331 DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8332 uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8333 size_t big_page_index; 8334 uvm_push_t push; 8335 NV_STATUS status; 8336 8337 // We only have to split to big PTEs if we're currently a 2M PTE 8338 if (existing_gpu_state->pte_is_2m) { 8339 // We can skip the split if the 2M PTE is invalid and we have no lower 8340 // PTEs. 8341 if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE && 8342 !existing_gpu_state->page_table_range_big.table && 8343 !existing_gpu_state->page_table_range_4k.table) 8344 return NV_OK; 8345 8346 alloc_sizes = big_page_size; 8347 bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8348 8349 if (!IS_ALIGNED(new->start, big_page_size)) { 8350 alloc_sizes |= UVM_PAGE_SIZE_4K; 8351 8352 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); 8353 __clear_bit(big_page_index, new_big_ptes); 8354 } 8355 8356 status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL); 8357 if (status != NV_OK) 8358 return status; 8359 8360 status = uvm_push_begin_acquire(gpu->channel_manager, 8361 UVM_CHANNEL_TYPE_MEMOPS, 8362 &existing->tracker, 8363 &push, 8364 "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", 8365 existing->start, existing->end + 1, 8366 new->start, new->end + 1); 8367 if (status != NV_OK) 8368 return status; 8369 8370 block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push); 8371 } 8372 else { 8373 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); 8374 8375 // If the split point is on a big page boundary, or if the split point 8376 // is not currently covered by a big PTE, we don't have to split 8377 // anything. 
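// Illustrative example, assuming a 64K big page size: a split point 96K into
// a big-page-aligned block falls in the middle of the second big page, so
// that single big PTE must be split into 4k PTEs; a split point at 128K is
// already on a big page boundary and no split is needed.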
8378 if (IS_ALIGNED(new->start, big_page_size) || 8379 big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK || 8380 !test_bit(big_page_index, existing_gpu_state->big_ptes)) 8381 return NV_OK; 8382 8383 status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL); 8384 if (status != NV_OK) 8385 return status; 8386 8387 bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8388 __set_bit(big_page_index, new_big_ptes); 8389 8390 status = uvm_push_begin_acquire(gpu->channel_manager, 8391 UVM_CHANNEL_TYPE_MEMOPS, 8392 &existing->tracker, 8393 &push, 8394 "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", 8395 existing->start, existing->end + 1, 8396 new->start, new->end + 1); 8397 if (status != NV_OK) 8398 return status; 8399 8400 block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push); 8401 } 8402 8403 uvm_push_end(&push); 8404 8405 // Adding this push to existing block tracker will cause all GPU PTE splits 8406 // to serialize on each other, but it's simpler than maintaining a separate 8407 // tracker and this path isn't performance-critical. 8408 return uvm_tracker_add_push_safe(&existing->tracker, &push); 8409 } 8410 8411 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new) 8412 { 8413 uvm_gpu_t *gpu; 8414 uvm_gpu_id_t id; 8415 NV_STATUS status; 8416 8417 for_each_gpu_id(id) { 8418 if (!uvm_va_block_gpu_state_get(existing, id)) 8419 continue; 8420 8421 gpu = block_get_gpu(existing, id); 8422 8423 if (block_gpu_has_page_tables(existing, gpu)) { 8424 status = block_split_presplit_ptes_gpu(existing, new, gpu); 8425 if (status != NV_OK) 8426 return status; 8427 } 8428 } 8429 8430 return NV_OK; 8431 } 8432 8433 typedef struct 8434 { 8435 // Number of chunks contained by this VA block 8436 size_t num_chunks; 8437 8438 // Index of the "interesting" chunk, either adjacent to or spanning the 8439 // split point depending on which block this is. 8440 size_t chunk_index; 8441 8442 // Size of the chunk referenced by chunk_index 8443 uvm_chunk_size_t chunk_size; 8444 } block_gpu_chunk_split_state_t; 8445 8446 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block, 8447 block_gpu_chunk_split_state_t *state, 8448 NvU64 start, 8449 NvU64 end, 8450 uvm_page_index_t page_index, 8451 uvm_gpu_t *gpu) 8452 { 8453 NvU64 size = end - start + 1; 8454 state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu); 8455 state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size); 8456 } 8457 8458 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 8459 { 8460 uvm_gpu_t *accessing_gpu; 8461 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 8462 8463 uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk); 8464 8465 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 8466 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 8467 8468 uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, 8469 peer_addr, 8470 uvm_gpu_chunk_get_size(chunk)); 8471 } 8472 } 8473 8474 // Perform any chunk splitting and array growing required for this block split, 8475 // but don't actually move chunk pointers anywhere. 
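// Illustrative example (assuming user chunk sizes of 4K, 64K and 2M): if the
// chunk spanning the split point is 2M and the new block needs a 64K chunk
// there, the code below computes
//
//     split_sizes = {4K, 64K, 2M} & (2M - 1) & ~(64K - 1) = {64K}
//
// so the 2M chunk is split once into 64K subchunks. If the target were 4K
// instead, split_sizes would be {4K, 64K}: the loop first splits 2M into 64K
// subchunks, then splits the 64K subchunk covering the split point into 4K.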
8476 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 8477 { 8478 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 8479 uvm_gpu_t *accessing_gpu; 8480 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 8481 uvm_gpu_chunk_t **temp_chunks; 8482 uvm_gpu_chunk_t *original_chunk, *curr_chunk; 8483 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8484 uvm_chunk_sizes_mask_t split_sizes; 8485 uvm_chunk_size_t subchunk_size; 8486 NV_STATUS status; 8487 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; 8488 8489 block_gpu_chunk_get_split_state(existing, 8490 &existing_before_state, 8491 existing->start, 8492 existing->end, 8493 split_page_index, 8494 gpu); 8495 block_gpu_chunk_get_split_state(existing, 8496 &existing_after_state, 8497 existing->start, 8498 new->start - 1, 8499 split_page_index - 1, 8500 gpu); 8501 block_gpu_chunk_get_split_state(new, 8502 &new_state, 8503 new->start, 8504 new->end, 8505 0, 8506 gpu); 8507 8508 // Even though we're splitting existing, we could wind up requiring a larger 8509 // chunks array if we split a large chunk into many smaller ones. 8510 if (existing_after_state.num_chunks > existing_before_state.num_chunks) { 8511 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, 8512 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); 8513 if (!temp_chunks) 8514 return NV_ERR_NO_MEMORY; 8515 existing_gpu_state->chunks = temp_chunks; 8516 } 8517 8518 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; 8519 8520 // If the chunk covering the split point is not populated, we're done. We've 8521 // already grown the array to cover any new chunks which may be populated 8522 // later. 8523 if (!original_chunk) 8524 return NV_OK; 8525 8526 // Figure out the splits we need to perform. Remove all sizes >= the current 8527 // size, and all sizes < the target size. Note that the resulting mask will 8528 // be 0 if the sizes match (we're already splitting at a chunk boundary). 8529 UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size); 8530 UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size); 8531 split_sizes = gpu->parent->mmu_user_chunk_sizes; 8532 split_sizes &= existing_before_state.chunk_size - 1; 8533 split_sizes &= ~(new_state.chunk_size - 1); 8534 8535 // Keep splitting the chunk covering the split point until we hit the target 8536 // size. 8537 curr_chunk = original_chunk; 8538 for_each_chunk_size_rev(subchunk_size, split_sizes) { 8539 size_t last_index, num_subchunks; 8540 8541 status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL); 8542 if (status != NV_OK) 8543 goto error; 8544 8545 // Split physical GPU mappings for indirect peers 8546 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 8547 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu); 8548 8549 status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, 8550 peer_addr, 8551 subchunk_size); 8552 if (status != NV_OK) 8553 goto error; 8554 } 8555 8556 if (subchunk_size == new_state.chunk_size) 8557 break; 8558 8559 // Compute the last subchunk index prior to the split point. 
Divide the 8560 // entire address space into units of subchunk_size, then mod by the 8561 // number of subchunks within the parent. 8562 last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size); 8563 num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size); 8564 UVM_ASSERT(num_subchunks > 1); 8565 last_index &= num_subchunks - 1; 8566 8567 uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk); 8568 UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size); 8569 } 8570 8571 // Note that existing's chunks array still has a pointer to original_chunk, 8572 // not to any newly-split subchunks. If a subsequent split failure occurs on 8573 // a later GPU we'll have to merge it back. Once we're past the preallocate 8574 // stage we'll remove it from the chunks array and move the new split chunks 8575 // in. 8576 8577 return NV_OK; 8578 8579 error: 8580 // On error we need to leave the chunk in its initial state 8581 block_merge_chunk(existing, gpu, original_chunk); 8582 8583 return status; 8584 } 8585 8586 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block) 8587 { 8588 uvm_cpu_chunk_storage_mixed_t *mixed; 8589 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0); 8590 NV_STATUS status; 8591 8592 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 8593 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK); 8594 8595 mixed = uvm_kvmalloc_zero(sizeof(*mixed)); 8596 if (!mixed) 8597 return NV_ERR_NO_MEMORY; 8598 8599 status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots); 8600 if (status != NV_OK) { 8601 uvm_kvfree(mixed); 8602 return status; 8603 } 8604 8605 bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK); 8606 block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 8607 return status; 8608 } 8609 8610 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index) 8611 { 8612 uvm_cpu_chunk_storage_mixed_t *mixed; 8613 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8614 uvm_cpu_chunk_t **small_chunks; 8615 size_t slot_index; 8616 NV_STATUS status; 8617 8618 UVM_ASSERT(chunk); 8619 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 8620 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8621 8622 mixed = uvm_cpu_storage_get_ptr(block); 8623 slot_index = compute_slot_index(block, page_index); 8624 small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 8625 if (!small_chunks) 8626 return NV_ERR_NO_MEMORY; 8627 8628 status = uvm_cpu_chunk_split(chunk, small_chunks); 8629 if (status != NV_OK) { 8630 uvm_kvfree(small_chunks); 8631 return status; 8632 } 8633 8634 mixed->slots[slot_index] = small_chunks; 8635 clear_bit(slot_index, mixed->big_chunks); 8636 return status; 8637 } 8638 8639 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index) 8640 { 8641 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8642 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); 8643 uvm_chunk_size_t new_size; 8644 uvm_gpu_t *gpu; 8645 NvU64 gpu_mapping_addr; 8646 uvm_processor_mask_t gpu_split_mask; 8647 uvm_gpu_id_t id; 8648 NV_STATUS status; 8649 8650 if (chunk_size == UVM_CHUNK_SIZE_2M) 8651 new_size = UVM_CHUNK_SIZE_64K; 8652 else 8653 new_size = UVM_CHUNK_SIZE_4K; 8654 8655 UVM_ASSERT(IS_ALIGNED(chunk_size, new_size)); 
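// Illustrative storage transitions (assuming 2M, 64K and 4K CPU chunk sizes):
// splitting the single 2M chunk swaps the block's chunk pointer for a
// uvm_cpu_chunk_storage_mixed_t whose slots hold 64K chunks
// (block_split_cpu_chunk_to_64k() above), while splitting a 64K chunk
// replaces one mixed slot with an array of 4K chunks and clears that slot's
// bit in big_chunks (block_split_cpu_chunk_to_4k() above).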
8656 8657 uvm_processor_mask_zero(&gpu_split_mask); 8658 for_each_gpu_id(id) { 8659 if (!uvm_va_block_gpu_state_get(block, id)) 8660 continue; 8661 8662 gpu = block_get_gpu(block, id); 8663 8664 // If the parent chunk has not been mapped, there is nothing to split. 8665 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8666 if (gpu_mapping_addr == 0) 8667 continue; 8668 8669 status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8670 gpu_mapping_addr, 8671 new_size); 8672 if (status != NV_OK) 8673 goto merge; 8674 8675 uvm_processor_mask_set(&gpu_split_mask, id); 8676 } 8677 8678 if (new_size == UVM_CHUNK_SIZE_64K) 8679 status = block_split_cpu_chunk_to_64k(block); 8680 else 8681 status = block_split_cpu_chunk_to_4k(block, page_index); 8682 8683 if (status != NV_OK) { 8684 merge: 8685 for_each_gpu_id_in_mask(id, &gpu_split_mask) { 8686 gpu = block_get_gpu(block, id); 8687 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8688 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8689 gpu_mapping_addr, 8690 chunk_size); 8691 } 8692 } 8693 8694 return status; 8695 } 8696 8697 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new) 8698 { 8699 uvm_cpu_chunk_storage_mixed_t *existing_mixed; 8700 uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL; 8701 size_t slot_offset; 8702 size_t existing_slot; 8703 NV_STATUS status = NV_OK; 8704 8705 UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED); 8706 existing_mixed = uvm_cpu_storage_get_ptr(existing); 8707 8708 // Pre-allocate chunk storage for the new block. By definition, the new block 8709 // will contain either 64K and/or 4K chunks. 8710 // 8711 // We do this here so there are no failures in block_split_cpu(). 8712 new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed)); 8713 if (!new_mixed) 8714 return NV_ERR_NO_MEMORY; 8715 8716 slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start)); 8717 existing_slot = slot_offset; 8718 for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) { 8719 size_t new_slot = existing_slot - slot_offset; 8720 8721 if (existing_mixed->slots[existing_slot]) { 8722 uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 8723 8724 if (!small_chunks) { 8725 status = NV_ERR_NO_MEMORY; 8726 goto done; 8727 } 8728 8729 new_mixed->slots[new_slot] = small_chunks; 8730 } 8731 } 8732 8733 new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 8734 UVM_ASSERT(status == NV_OK); 8735 8736 done: 8737 if (status != NV_OK) { 8738 for (; existing_slot > slot_offset; existing_slot--) 8739 uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]); 8740 8741 uvm_kvfree(new_mixed); 8742 } 8743 8744 return status; 8745 } 8746 8747 static void block_free_cpu_chunk_storage(uvm_va_block_t *block) 8748 { 8749 if (block->cpu.chunks) { 8750 uvm_cpu_chunk_storage_mixed_t *mixed; 8751 size_t slot_index; 8752 8753 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8754 mixed = uvm_cpu_storage_get_ptr(block); 8755 for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++) 8756 uvm_kvfree(mixed->slots[slot_index]); 8757 8758 uvm_kvfree(mixed); 8759 block->cpu.chunks = 0; 8760 } 8761 } 8762 8763 // Perform any CPU chunk splitting that may be required for this block split. 
8764 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block. 8765 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) 8766 { 8767 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); 8768 uvm_cpu_chunk_t *splitting_chunk; 8769 uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes(); 8770 uvm_chunk_size_t subchunk_size; 8771 NV_STATUS status = NV_OK; 8772 8773 UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE)); 8774 splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8775 8776 // If the page covering the split point has not been populated, there is no 8777 // need to split. 8778 if (!splitting_chunk) 8779 return NV_OK; 8780 8781 // If the split point is aligned on the chunk size, there is no need to 8782 // split. 8783 if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk))) 8784 return NV_OK; 8785 8786 // Remove all sizes above the chunk's current size. 8787 split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1; 8788 // Remove all sizes below the alignment of the new block's start. 8789 split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0); 8790 8791 for_each_chunk_size_rev(subchunk_size, split_sizes) { 8792 status = block_split_cpu_chunk_one(existing, page_index); 8793 if (status != NV_OK) 8794 return status; 8795 } 8796 8797 return block_prealloc_cpu_chunk_storage(existing, new); 8798 } 8799 8800 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index) 8801 { 8802 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block); 8803 size_t slot_index = compute_slot_index(block, page_index); 8804 uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index]; 8805 uvm_cpu_chunk_t *merged_chunk; 8806 8807 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8808 UVM_ASSERT(small_chunks); 8809 UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks)); 8810 8811 merged_chunk = uvm_cpu_chunk_merge(small_chunks); 8812 mixed->slots[slot_index] = merged_chunk; 8813 set_bit(slot_index, mixed->big_chunks); 8814 uvm_kvfree(small_chunks); 8815 } 8816 8817 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index) 8818 { 8819 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block); 8820 uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots; 8821 uvm_cpu_chunk_t *merged_chunk; 8822 8823 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8824 UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK)); 8825 8826 merged_chunk = uvm_cpu_chunk_merge(big_chunks); 8827 block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK; 8828 uvm_kvfree(mixed); 8829 } 8830 8831 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index) 8832 { 8833 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8834 uvm_gpu_id_t id; 8835 8836 if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) { 8837 block_merge_cpu_chunks_to_64k(block, page_index); 8838 } 8839 else { 8840 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 8841 block_merge_cpu_chunks_to_2m(block, page_index); 8842 } 8843 8844 chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8845 8846 for_each_gpu_id(id) { 8847 NvU64 gpu_mapping_addr; 8848 uvm_gpu_t *gpu; 8849 8850 if (!uvm_va_block_gpu_state_get(block, id)) 8851 
continue; 8852 8853 gpu = block_get_gpu(block, id); 8854 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8855 if (gpu_mapping_addr == 0) 8856 continue; 8857 8858 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8859 gpu_mapping_addr, 8860 uvm_cpu_chunk_get_size(chunk)); 8861 } 8862 } 8863 8864 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) 8865 { 8866 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); 8867 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8868 uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes(); 8869 uvm_chunk_size_t largest_size; 8870 uvm_chunk_size_t chunk_size; 8871 uvm_chunk_size_t merge_size; 8872 size_t block_size = uvm_va_block_size(existing); 8873 8874 if (!chunk || uvm_cpu_chunk_is_physical(chunk)) 8875 return; 8876 8877 chunk_size = uvm_cpu_chunk_get_size(chunk); 8878 8879 // Remove all CPU chunk sizes above the size of the existing VA block. 8880 // Since block sizes are not always powers of 2, use the largest power of 2 8881 // less than or equal to the block size since we can't merge to a size 8882 // larger than the block's size. 8883 largest_size = rounddown_pow_of_two(block_size); 8884 merge_sizes &= (largest_size | (largest_size - 1)); 8885 8886 // Remove all CPU chunk sizes smaller than the size of the chunk being merged up. 8887 merge_sizes &= ~(chunk_size | (chunk_size - 1)); 8888 8889 for_each_chunk_size(merge_size, merge_sizes) { 8890 uvm_va_block_region_t chunk_region; 8891 8892 // The block has to fully contain the VA range after the merge. 8893 if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) || 8894 !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1)) 8895 break; 8896 8897 chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index); 8898 8899 // If not all pages in the region covered by the chunk are allocated, 8900 // we can't merge. 8901 if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region)) 8902 break; 8903 8904 block_merge_cpu_chunks_one(existing, chunk_region.first); 8905 chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8906 if (uvm_cpu_chunk_is_physical(chunk)) 8907 break; 8908 } 8909 8910 block_free_cpu_chunk_storage(new); 8911 } 8912 8913 // Pre-allocate everything which doesn't require retry on both existing and new 8914 // which will be needed to handle a split. If this fails, existing must remain 8915 // functionally unmodified. 
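// The pre-allocations cover:
// - CPU chunk splits and the new block's CPU chunk storage
//   (block_presplit_cpu_chunks)
// - GPU chunk splits and the new block's per-GPU state, for each GPU which
//   has state on existing (block_presplit_gpu_chunks, block_gpu_state_get_alloc)
// - For HMM blocks, a split of the policy node straddling the split point
// On failure, any chunks which were already split are merged back so existing
// is left as it was found.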
8916 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new) 8917 { 8918 NV_STATUS status; 8919 uvm_gpu_t *gpu; 8920 uvm_gpu_id_t id; 8921 uvm_page_index_t split_page_index; 8922 uvm_va_block_test_t *block_test; 8923 8924 status = block_presplit_cpu_chunks(existing, new); 8925 if (status != NV_OK) 8926 goto error; 8927 8928 for_each_gpu_id(id) { 8929 if (!uvm_va_block_gpu_state_get(existing, id)) 8930 continue; 8931 8932 gpu = block_get_gpu(existing, id); 8933 8934 status = block_presplit_gpu_chunks(existing, new, gpu); 8935 if (status != NV_OK) 8936 goto error; 8937 8938 if (!block_gpu_state_get_alloc(new, gpu)) { 8939 status = NV_ERR_NO_MEMORY; 8940 goto error; 8941 } 8942 } 8943 8944 block_test = uvm_va_block_get_test(existing); 8945 if (block_test && block_test->inject_split_error) { 8946 block_test->inject_split_error = false; 8947 if (!uvm_va_block_is_hmm(existing)) { 8948 UVM_ASSERT(existing->va_range->inject_split_error); 8949 existing->va_range->inject_split_error = false; 8950 } 8951 status = NV_ERR_NO_MEMORY; 8952 goto error; 8953 } 8954 8955 if (uvm_va_block_is_hmm(existing)) { 8956 uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start); 8957 8958 if (node && node->node.start != new->start) { 8959 status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL); 8960 if (status != NV_OK) 8961 goto error; 8962 } 8963 } 8964 8965 return NV_OK; 8966 8967 error: 8968 // Merge back the chunks we split 8969 split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8970 8971 for_each_gpu_id(id) { 8972 uvm_gpu_chunk_t *chunk; 8973 size_t chunk_index; 8974 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id); 8975 8976 if (!existing_gpu_state) 8977 continue; 8978 8979 // If the chunk spanning the split point was split, merge it back 8980 gpu = block_get_gpu(existing, id); 8981 chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL); 8982 chunk = existing_gpu_state->chunks[chunk_index]; 8983 if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) 8984 continue; 8985 8986 block_merge_chunk(existing, gpu, chunk); 8987 8988 // We could attempt to shrink the chunks array back down, but it doesn't 8989 // hurt much to have it larger than necessary, and we'd have to handle 8990 // the shrink call failing anyway on this error path. 8991 8992 } 8993 8994 block_merge_cpu_chunks(existing, new); 8995 8996 return status; 8997 } 8998 8999 // Re-calculate the block's top-level processor masks: 9000 // - block->mapped 9001 // - block->resident 9002 // 9003 // This is called on block split. 
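// The per-GPU evicted state (block->evicted_gpus) is refreshed here as well.
// The masks are recomputed from the per-page state (the CPU and per-GPU
// pte_bits, resident and evicted masks) rather than adjusted incrementally,
// since the split has already distributed that per-page state between the two
// blocks.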
9004 static void block_set_processor_masks(uvm_va_block_t *block) 9005 { 9006 size_t num_pages = uvm_va_block_num_cpu_pages(block); 9007 uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages); 9008 uvm_gpu_id_t id; 9009 9010 if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) { 9011 UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region)); 9012 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 9013 } 9014 else { 9015 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); 9016 } 9017 9018 if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) { 9019 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 9020 9021 if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0) 9022 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)); 9023 9024 block_clear_resident_processor(block, UVM_ID_CPU); 9025 } 9026 else { 9027 block_set_resident_processor(block, UVM_ID_CPU); 9028 } 9029 9030 for_each_gpu_id(id) { 9031 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 9032 if (!gpu_state) 9033 continue; 9034 9035 if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) { 9036 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region)); 9037 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region)); 9038 uvm_processor_mask_clear(&block->mapped, id); 9039 } 9040 else { 9041 uvm_processor_mask_set(&block->mapped, id); 9042 } 9043 9044 if (uvm_page_mask_region_empty(&gpu_state->resident, block_region)) 9045 block_clear_resident_processor(block, id); 9046 else 9047 block_set_resident_processor(block, id); 9048 9049 if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region)) 9050 uvm_processor_mask_clear(&block->evicted_gpus, id); 9051 else 9052 uvm_processor_mask_set(&block->evicted_gpus, id); 9053 } 9054 } 9055 9056 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts 9057 // corresponding to a block split. 9058 static void block_split_page_mask(uvm_page_mask_t *existing_mask, 9059 size_t existing_pages, 9060 uvm_page_mask_t *new_mask, 9061 size_t new_pages) 9062 { 9063 UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n", 9064 existing_pages, new_pages); 9065 9066 // The new block is always in the upper region of existing, so shift the bit 9067 // vectors down. 9068 // 9069 // Note that bitmap_shift_right requires both dst and src to be the same 9070 // size. That's ok since we don't scale them by block size. 9071 uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages); 9072 uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages)); 9073 } 9074 9075 // Split the CPU state within the existing block. existing's start is correct 9076 // but its end has not yet been adjusted. 
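// CPU chunks covering pages at or above the split point are removed from
// existing and re-inserted into new at the corresponding page offset. The
// storage backing those insertions was pre-allocated by
// block_prealloc_cpu_chunk_storage(), so the inserts are not expected to fail.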
9077 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new) 9078 { 9079 size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new); 9080 uvm_pte_bits_cpu_t pte_bit; 9081 uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing); 9082 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 9083 uvm_page_index_t page_index; 9084 uvm_page_index_t next_page_index; 9085 uvm_cpu_chunk_t *chunk; 9086 uvm_va_range_t *existing_va_range = existing->va_range; 9087 9088 if (existing_va_range) { 9089 UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9090 UVM_ASSERT(existing->va_range->type == new->va_range->type); 9091 } 9092 9093 UVM_ASSERT(existing->start < new->start); 9094 UVM_ASSERT(existing->end == new->end); 9095 9096 UVM_ASSERT(PAGE_ALIGNED(new->start)); 9097 UVM_ASSERT(PAGE_ALIGNED(existing->start)); 9098 9099 existing_pages = (new->start - existing->start) / PAGE_SIZE; 9100 9101 // We don't have to unmap the CPU since its virtual -> physical mappings 9102 // don't change. 9103 9104 page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1); 9105 9106 for_each_cpu_chunk_in_block_region_safe(chunk, 9107 page_index, 9108 next_page_index, 9109 existing, 9110 uvm_va_block_region(split_page_index, block_region.outer)) { 9111 uvm_page_index_t new_chunk_page_index; 9112 NV_STATUS status; 9113 9114 uvm_cpu_chunk_remove_from_block(existing, page_index); 9115 9116 // The chunk has to be adjusted for the new block before inserting it. 9117 new_chunk_page_index = page_index - split_page_index; 9118 9119 // This should never fail because all necessary storage was allocated 9120 // in block_presplit_cpu_chunks(). 9121 status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index); 9122 UVM_ASSERT(status == NV_OK); 9123 } 9124 9125 new->cpu.ever_mapped = existing->cpu.ever_mapped; 9126 9127 block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages); 9128 9129 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) 9130 block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages); 9131 } 9132 9133 // Fill out the blocks' chunks arrays with the chunks split by 9134 // block_presplit_gpu_chunks. 
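// Besides populating the chunk arrays, this updates the reverse map of every
// chunk handed over to new (va_block and va_block_page_index) under the PMM
// list lock, and then tries to shrink existing's chunks array if it is now
// larger than needed, keeping the old allocation if the realloc fails.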
9135 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 9136 { 9137 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 9138 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id); 9139 uvm_gpu_chunk_t **temp_chunks; 9140 uvm_gpu_chunk_t *original_chunk; 9141 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; 9142 size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new; 9143 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 9144 size_t i; 9145 9146 block_gpu_chunk_get_split_state(existing, 9147 &existing_before_state, 9148 existing->start, 9149 existing->end, 9150 split_page_index, 9151 gpu); 9152 block_gpu_chunk_get_split_state(existing, 9153 &existing_after_state, 9154 existing->start, 9155 new->start - 1, 9156 split_page_index - 1, 9157 gpu); 9158 block_gpu_chunk_get_split_state(new, 9159 &new_state, 9160 new->start, 9161 new->end, 9162 0, 9163 gpu); 9164 9165 // General case (B is original_chunk): 9166 // split 9167 // v 9168 // existing (before) [------ A -----][------ B -----][------ C -----] 9169 // existing (after) [------ A -----][- B0 -] 9170 // new [- B1 -][------ C -----] 9171 // 9172 // Note that the logic below also handles the case of the split happening at 9173 // a chunk boundary. That case behaves as though there is no B0 chunk. 9174 9175 // Number of chunks to the left and right of original_chunk (A and C above). 9176 // Either or both of these may be 0. 9177 num_pre_chunks = existing_before_state.chunk_index; 9178 num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1; 9179 9180 // Number of subchunks under existing's portion of original_chunk (B0 above) 9181 num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks; 9182 9183 // Number of subchunks under new's portion of original_chunk (B1 above) 9184 num_split_chunks_new = new_state.num_chunks - num_post_chunks; 9185 9186 UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0); 9187 UVM_ASSERT(num_split_chunks_new > 0); 9188 9189 // Copy post chunks from the end of existing into new (C above) 9190 memcpy(&new_gpu_state->chunks[num_split_chunks_new], 9191 &existing_gpu_state->chunks[existing_before_state.chunk_index + 1], 9192 num_post_chunks * sizeof(new_gpu_state->chunks[0])); 9193 9194 // Save off the original split chunk since we may overwrite the array 9195 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; 9196 9197 // Fill out the new pointers 9198 if (original_chunk) { 9199 // Note that if the split happened at a chunk boundary, original_chunk 9200 // will not be split. In that case, num_split_chunks_existing will be 0 9201 // and num_split_chunks_new will be 1, so the left copy will be skipped 9202 // and the right copy will pick up the chunk. 9203 9204 // Copy left newly-split chunks into existing (B0 above). The array was 9205 // re-sized in block_presplit_gpu_chunks as necessary. 9206 size_t num_subchunks; 9207 9208 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, 9209 original_chunk, 9210 0, // start_index 9211 num_split_chunks_existing, 9212 &existing_gpu_state->chunks[existing_before_state.chunk_index]); 9213 UVM_ASSERT(num_subchunks == num_split_chunks_existing); 9214 9215 // Copy right newly-split chunks into new (B1 above), overwriting the 9216 // pointer to the original chunk. 
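        // The start_index continues where the previous call left off, so
        // between the two calls every subchunk of original_chunk lands in
        // either existing's or new's array.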
9217 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, 9218 original_chunk, 9219 num_split_chunks_existing, // start_index 9220 num_split_chunks_new, 9221 &new_gpu_state->chunks[0]); 9222 UVM_ASSERT(num_subchunks == num_split_chunks_new); 9223 } 9224 else { 9225 // If the chunk wasn't already populated we don't need to copy pointers 9226 // anywhere, but we need to clear out stale pointers from existing's 9227 // array covering the new elements. new's chunks array was already zero- 9228 // initialized. 9229 memset(&existing_gpu_state->chunks[existing_before_state.chunk_index], 9230 0, 9231 num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0])); 9232 } 9233 9234 // Since we update the reverse map information, protect it against a 9235 // concurrent lookup 9236 uvm_spin_lock(&gpu->pmm.list_lock); 9237 9238 // Update the reverse map of all the chunks that are now under the new block 9239 for (i = 0; i < new_state.num_chunks; ++i) { 9240 if (new_gpu_state->chunks[i]) { 9241 UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing); 9242 new_gpu_state->chunks[i]->va_block = new; 9243 9244 // Adjust the page_index within the VA block for the new subchunks in 9245 // the new VA block 9246 UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index); 9247 new_gpu_state->chunks[i]->va_block_page_index -= split_page_index; 9248 } 9249 } 9250 9251 uvm_spin_unlock(&gpu->pmm.list_lock); 9252 9253 // Attempt to shrink existing's chunk allocation. If the realloc fails, just 9254 // keep on using the old larger one. 9255 if (existing_after_state.num_chunks < existing_before_state.num_chunks) { 9256 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, 9257 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); 9258 if (temp_chunks) 9259 existing_gpu_state->chunks = temp_chunks; 9260 } 9261 } 9262 9263 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id) 9264 { 9265 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id); 9266 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id); 9267 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 9268 uvm_gpu_va_space_t *gpu_va_space; 9269 uvm_gpu_t *gpu; 9270 uvm_gpu_t *accessing_gpu; 9271 size_t new_pages = uvm_va_block_num_cpu_pages(new); 9272 size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big; 9273 uvm_pte_bits_gpu_t pte_bit; 9274 size_t num_chunks, i; 9275 uvm_cpu_chunk_t *cpu_chunk; 9276 uvm_page_index_t page_index; 9277 9278 if (!existing_gpu_state) 9279 return; 9280 9281 gpu = uvm_va_space_get_gpu(va_space, gpu_id); 9282 UVM_ASSERT(new_gpu_state); 9283 9284 new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes; 9285 9286 UVM_ASSERT(PAGE_ALIGNED(new->start)); 9287 UVM_ASSERT(PAGE_ALIGNED(existing->start)); 9288 existing_pages = (new->start - existing->start) / PAGE_SIZE; 9289 9290 for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) { 9291 uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, 9292 uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent), 9293 new); 9294 } 9295 9296 block_copy_split_gpu_chunks(existing, new, gpu); 9297 9298 num_chunks = block_num_gpu_chunks(new, gpu); 9299 9300 // Reparent GPU mappings for indirect peers 9301 for (i = 0; i < num_chunks; ++i) { 9302 uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i]; 9303 if (!chunk) 9304 continue; 9305 9306 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, 
&va_space->indirect_peers[uvm_id_value(gpu->id)]) { 9307 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 9308 9309 uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, 9310 peer_addr, 9311 new); 9312 } 9313 } 9314 9315 block_split_page_mask(&existing_gpu_state->resident, 9316 existing_pages, 9317 &new_gpu_state->resident, 9318 new_pages); 9319 9320 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 9321 block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages, 9322 &new_gpu_state->pte_bits[pte_bit], new_pages); 9323 } 9324 9325 // Adjust page table ranges. 9326 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); 9327 if (gpu_va_space) { 9328 if (existing_gpu_state->page_table_range_big.table) { 9329 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); 9330 9331 // existing's end has not been adjusted yet 9332 existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size); 9333 9334 // Take references on all big pages covered by new 9335 new_pages_big = uvm_va_block_num_big_pages(new, big_page_size); 9336 if (new_pages_big) { 9337 uvm_page_table_range_get_upper(&gpu_va_space->page_tables, 9338 &existing_gpu_state->page_table_range_big, 9339 &new_gpu_state->page_table_range_big, 9340 new_pages_big); 9341 9342 // If the split point is within a big page region, we might have 9343 // a gap since neither existing nor new can use it anymore. 9344 // Get the top N bits from existing's mask to handle that. 9345 bitmap_shift_right(new_gpu_state->big_ptes, 9346 existing_gpu_state->big_ptes, 9347 uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big, 9348 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 9349 9350 new_gpu_state->initialized_big = existing_gpu_state->initialized_big; 9351 } 9352 9353 // Drop existing's references on the big PTEs it no longer covers 9354 // now that new has references on them. Note that neither existing 9355 // nor new might have big PTEs after the split. In that case, this 9356 // shrink will free the entire old range. 9357 uvm_page_table_range_shrink(&gpu_va_space->page_tables, 9358 &existing_gpu_state->page_table_range_big, 9359 existing_pages_big); 9360 9361 if (existing_pages_big == 0) { 9362 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big)); 9363 existing_gpu_state->initialized_big = false; 9364 } 9365 9366 bitmap_clear(existing_gpu_state->big_ptes, 9367 existing_pages_big, 9368 MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big); 9369 } 9370 9371 if (existing_gpu_state->page_table_range_4k.table) { 9372 // Since existing and new share the same PDE we just need to bump 9373 // the ref-count on new's sub-range. 9374 uvm_page_table_range_get_upper(&gpu_va_space->page_tables, 9375 &existing_gpu_state->page_table_range_4k, 9376 &new_gpu_state->page_table_range_4k, 9377 uvm_va_block_size(new) / UVM_PAGE_SIZE_4K); 9378 9379 // Drop existing's references on the PTEs it no longer covers now 9380 // that new has references on them. 9381 existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K); 9382 uvm_page_table_range_shrink(&gpu_va_space->page_tables, 9383 &existing_gpu_state->page_table_range_4k, 9384 existing_pages_4k); 9385 } 9386 9387 // We have to set this explicitly to handle the case of splitting an 9388 // invalid, active 2M PTE with no lower page tables allocated. 
9389 if (existing_gpu_state->pte_is_2m) { 9390 UVM_ASSERT(!existing_gpu_state->page_table_range_big.table); 9391 UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table); 9392 existing_gpu_state->pte_is_2m = false; 9393 } 9394 9395 // existing can't possibly cover 2MB after a split, so drop any 2M PTE 9396 // references it has. We've taken the necessary references on the lower 9397 // tables above. 9398 block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m); 9399 existing_gpu_state->activated_big = false; 9400 existing_gpu_state->activated_4k = false; 9401 } 9402 9403 block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages); 9404 } 9405 9406 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block, 9407 NvU64 new_end, 9408 uvm_va_block_t **new_va_block, 9409 uvm_va_range_t *new_va_range) 9410 { 9411 uvm_va_space_t *va_space; 9412 uvm_va_block_t *new_block = NULL; 9413 NV_STATUS status; 9414 9415 va_space = new_va_range->va_space; 9416 UVM_ASSERT(existing_va_block->va_range); 9417 UVM_ASSERT(existing_va_block->va_range->va_space == va_space); 9418 UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block)); 9419 9420 // External range types can't be split 9421 UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9422 UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9423 uvm_assert_rwsem_locked_write(&va_space->lock); 9424 9425 UVM_ASSERT(new_end > existing_va_block->start); 9426 UVM_ASSERT(new_end < existing_va_block->end); 9427 UVM_ASSERT(PAGE_ALIGNED(new_end + 1)); 9428 9429 status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block); 9430 if (status != NV_OK) 9431 return status; 9432 9433 // We're protected from other splits and faults by the va_space lock being 9434 // held in write mode, but that doesn't stop the reverse mapping (eviction 9435 // path) from inspecting the existing block. Stop those threads by taking 9436 // the block lock. When a reverse mapping thread takes this lock after the 9437 // split has been performed, it will have to re-inspect state and may see 9438 // that it should use the newly-split block instead. 9439 uvm_mutex_lock(&existing_va_block->lock); 9440 9441 status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range); 9442 9443 uvm_mutex_unlock(&existing_va_block->lock); 9444 9445 if (status != NV_OK) 9446 uvm_va_block_release(new_block); 9447 else if (new_va_block) 9448 *new_va_block = new_block; 9449 9450 return status; 9451 } 9452 9453 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block, 9454 NvU64 new_end, 9455 uvm_va_block_t *new_block, 9456 uvm_va_range_t *new_va_range) 9457 { 9458 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block); 9459 uvm_gpu_id_t id; 9460 NV_STATUS status; 9461 uvm_perf_event_data_t event_data; 9462 9463 UVM_ASSERT(block_check_chunks(existing_va_block)); 9464 9465 // As soon as we update existing's reverse mappings to point to the newly- 9466 // split block, the eviction path could try to operate on the new block. 9467 // Lock that out too until new is ready. 9468 // 9469 // Note that we usually shouldn't nest block locks, but it's ok here because 9470 // we just created new_block so no other thread could possibly take it out 9471 // of order with existing's lock. 
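    // (The _no_tracking lock variant is presumably used so that the driver's
    // lock tracking, which does not expect two block locks to be held at
    // once, does not flag this intentional nesting.)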
9472 uvm_mutex_lock_no_tracking(&new_block->lock); 9473 9474 // The split has to be transactional, meaning that if we fail, the existing 9475 // block must not be modified. Handle that by pre-allocating everything we 9476 // might need under both existing and new at the start so we only have a 9477 // single point of failure. 9478 9479 // Since pre-allocation might require allocating new PTEs, we have to handle 9480 // allocation retry which might drop existing's block lock. The 9481 // preallocation is split into two steps for that: the first part which 9482 // allocates and splits PTEs can handle having the block lock dropped then 9483 // re-taken. It won't modify existing_va_block other than adding new PTE 9484 // allocations and splitting existing PTEs, which is always safe. 9485 status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block, 9486 NULL, 9487 block_split_presplit_ptes(existing_va_block, new_block)); 9488 if (status != NV_OK) 9489 goto out; 9490 9491 // Pre-allocate, stage two. This modifies existing_va_block in ways which 9492 // violate many assumptions (such as changing chunk size), but it will put 9493 // things back into place on a failure without dropping the block lock. 9494 status = block_split_preallocate_no_retry(existing_va_block, new_block); 9495 if (status != NV_OK) 9496 goto out; 9497 9498 // We'll potentially be freeing page tables, so we need to wait for any 9499 // outstanding work before we start 9500 status = uvm_tracker_wait(&existing_va_block->tracker); 9501 if (status != NV_OK) 9502 goto out; 9503 9504 // Update existing's state only once we're past all failure points 9505 9506 event_data.block_shrink.block = existing_va_block; 9507 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data); 9508 9509 block_split_cpu(existing_va_block, new_block); 9510 9511 for_each_gpu_id(id) 9512 block_split_gpu(existing_va_block, new_block, id); 9513 9514 // Update the size of the existing block first so that 9515 // block_set_processor_masks can use block_{set,clear}_resident_processor 9516 // that relies on the size to be correct. 9517 existing_va_block->end = new_end; 9518 9519 block_split_page_mask(&existing_va_block->read_duplicated_pages, 9520 uvm_va_block_num_cpu_pages(existing_va_block), 9521 &new_block->read_duplicated_pages, 9522 uvm_va_block_num_cpu_pages(new_block)); 9523 9524 block_split_page_mask(&existing_va_block->maybe_mapped_pages, 9525 uvm_va_block_num_cpu_pages(existing_va_block), 9526 &new_block->maybe_mapped_pages, 9527 uvm_va_block_num_cpu_pages(new_block)); 9528 9529 block_set_processor_masks(existing_va_block); 9530 block_set_processor_masks(new_block); 9531 9532 if (uvm_va_block_is_hmm(existing_va_block)) { 9533 uvm_hmm_va_block_split_tree(existing_va_block, new_block); 9534 uvm_va_policy_node_split_move(existing_va_block, new_block); 9535 } 9536 9537 out: 9538 // Run checks on existing_va_block even on failure, since an error must 9539 // leave the block in a consistent state. 
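    // On failure, new_block may still own CPU chunk storage pre-allocated for
    // the split; it is freed below so the caller can simply release new_block.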
9540 UVM_ASSERT(block_check_chunks(existing_va_block)); 9541 UVM_ASSERT(block_check_mappings(existing_va_block)); 9542 if (status == NV_OK) { 9543 UVM_ASSERT(block_check_chunks(new_block)); 9544 UVM_ASSERT(block_check_mappings(new_block)); 9545 } 9546 else { 9547 block_free_cpu_chunk_storage(new_block); 9548 } 9549 9550 uvm_mutex_unlock_no_tracking(&new_block->lock); 9551 9552 return status; 9553 } 9554 9555 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block, 9556 uvm_va_block_region_t region) 9557 { 9558 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9559 uvm_va_range_t *va_range = va_block->va_range; 9560 9561 if (!uvm_va_space_can_read_duplicate(va_space, NULL)) 9562 return false; 9563 9564 // TODO: Bug 3660922: need to implement HMM read duplication support. 9565 if (uvm_va_block_is_hmm(va_block) || 9566 uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED) 9567 return false; 9568 9569 if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET 9570 && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0) 9571 return false; 9572 9573 return true; 9574 } 9575 9576 // Returns the new access permission for the processor that faulted or 9577 // triggered access counter notifications on the given page 9578 // 9579 // TODO: Bug 1766424: this function works on a single page at a time. This 9580 // could be changed in the future to optimize multiple faults/counters on 9581 // contiguous pages. 9582 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block, 9583 uvm_va_block_context_t *va_block_context, 9584 uvm_page_index_t page_index, 9585 uvm_processor_id_t fault_processor_id, 9586 uvm_processor_id_t new_residency, 9587 uvm_fault_access_type_t access_type) 9588 { 9589 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9590 uvm_prot_t logical_prot, new_prot; 9591 9592 // TODO: Bug 1766432: Refactor into policies. Current policy is 9593 // query_promote: upgrade access privileges to avoid future faults IF 9594 // they don't trigger further revocations. 9595 new_prot = uvm_fault_access_type_to_prot(access_type); 9596 logical_prot = compute_logical_prot(va_block, va_block_context, page_index); 9597 9598 UVM_ASSERT(logical_prot >= new_prot); 9599 9600 if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY && 9601 !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) { 9602 uvm_processor_mask_t processors_with_atomic_mapping; 9603 uvm_processor_mask_t revoke_processors; 9604 9605 block_page_authorized_processors(va_block, 9606 page_index, 9607 UVM_PROT_READ_WRITE_ATOMIC, 9608 &processors_with_atomic_mapping); 9609 9610 uvm_processor_mask_andnot(&revoke_processors, 9611 &processors_with_atomic_mapping, 9612 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 9613 9614 // Only check if there are no faultable processors in the revoke 9615 // processors mask. 9616 uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors); 9617 9618 if (uvm_processor_mask_empty(&revoke_processors)) 9619 new_prot = UVM_PROT_READ_WRITE; 9620 } 9621 if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC) { 9622 // HMM allocations with logical read/write/atomic permission can be 9623 // upgraded without notifying the driver so assume read/write/atomic 9624 // even if the fault is only for reading. 
9625 if (new_prot == UVM_PROT_READ_WRITE || 9626 (UVM_ID_IS_CPU(fault_processor_id) && uvm_va_block_is_hmm(va_block))) { 9627 if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id)) 9628 new_prot = UVM_PROT_READ_WRITE_ATOMIC; 9629 } 9630 } 9631 9632 return new_prot; 9633 } 9634 9635 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block, 9636 uvm_va_block_context_t *va_block_context, 9637 uvm_processor_id_t new_residency, 9638 uvm_processor_id_t processor_id, 9639 const uvm_processor_mask_t *map_processors, 9640 uvm_va_block_region_t region, 9641 const uvm_page_mask_t *map_page_mask, 9642 uvm_prot_t max_prot, 9643 const uvm_processor_mask_t *thrashing_processors, 9644 uvm_tracker_t *tracker) 9645 { 9646 NV_STATUS status; 9647 uvm_processor_id_t map_processor_id; 9648 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9649 uvm_prot_t new_map_prot = max_prot; 9650 uvm_processor_mask_t map_processors_local; 9651 9652 uvm_processor_mask_copy(&map_processors_local, map_processors); 9653 9654 // Handle atomic mappings separately 9655 if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) { 9656 bool this_processor_has_native_atomics; 9657 9658 this_processor_has_native_atomics = 9659 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id); 9660 9661 if (this_processor_has_native_atomics) { 9662 uvm_processor_mask_t map_atomic_processors; 9663 9664 // Compute processors with native atomics to the residency 9665 uvm_processor_mask_and(&map_atomic_processors, 9666 &map_processors_local, 9667 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 9668 9669 // Filter out these mapped processors for the next steps 9670 uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors); 9671 9672 for_each_id_in_mask(map_processor_id, &map_atomic_processors) { 9673 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; 9674 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) 9675 cause = UvmEventMapRemoteCauseThrashing; 9676 9677 status = uvm_va_block_map(va_block, 9678 va_block_context, 9679 map_processor_id, 9680 region, 9681 map_page_mask, 9682 UVM_PROT_READ_WRITE_ATOMIC, 9683 cause, 9684 tracker); 9685 if (status != NV_OK) 9686 return status; 9687 } 9688 9689 new_map_prot = UVM_PROT_READ_WRITE; 9690 } 9691 else { 9692 if (UVM_ID_IS_CPU(processor_id)) 9693 new_map_prot = UVM_PROT_READ_WRITE; 9694 else 9695 new_map_prot = UVM_PROT_READ_ONLY; 9696 } 9697 } 9698 9699 // Map the rest of processors 9700 for_each_id_in_mask(map_processor_id, &map_processors_local) { 9701 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; 9702 uvm_prot_t final_map_prot; 9703 bool map_processor_has_enabled_system_wide_atomics = 9704 uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id); 9705 9706 // Write mappings from processors with disabled system-wide atomics are treated like atomics 9707 if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics) 9708 final_map_prot = UVM_PROT_READ_WRITE_ATOMIC; 9709 else 9710 final_map_prot = new_map_prot; 9711 9712 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) 9713 cause = UvmEventMapRemoteCauseThrashing; 9714 9715 status = uvm_va_block_map(va_block, 9716 va_block_context, 9717 map_processor_id, 9718 region, 9719 map_page_mask, 9720 final_map_prot, 9721 cause, 
9722 tracker); 9723 if (status != NV_OK) 9724 return status; 9725 } 9726 9727 return NV_OK; 9728 } 9729 9730 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block, 9731 uvm_va_block_context_t *va_block_context, 9732 uvm_processor_id_t new_residency, 9733 uvm_processor_id_t processor_id, 9734 uvm_va_block_region_t region, 9735 const uvm_page_mask_t *map_page_mask, 9736 uvm_prot_t max_prot, 9737 const uvm_processor_mask_t *thrashing_processors) 9738 { 9739 NV_STATUS tracker_status, status = NV_OK; 9740 uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus; 9741 uvm_processor_id_t map_processor_id; 9742 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9743 const uvm_page_mask_t *final_page_mask = map_page_mask; 9744 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 9745 const uvm_va_policy_t *policy = va_block_context->policy; 9746 uvm_processor_id_t preferred_location; 9747 9748 uvm_assert_mutex_locked(&va_block->lock); 9749 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, region)); 9750 9751 // Read duplication takes precedence over SetAccessedBy. 9752 // 9753 // Exclude ranges with read duplication set... 9754 if (uvm_va_policy_is_read_duplicate(policy, va_space)) { 9755 status = NV_OK; 9756 goto out; 9757 } 9758 9759 // ... and pages read-duplicated by performance heuristics 9760 if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) { 9761 if (map_page_mask) { 9762 uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask, 9763 map_page_mask, 9764 &va_block->read_duplicated_pages); 9765 } 9766 else { 9767 uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages); 9768 } 9769 final_page_mask = &va_block_context->mapping.filtered_page_mask; 9770 } 9771 9772 // Add mappings for accessed_by processors and the given processor mask 9773 if (thrashing_processors) 9774 uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors); 9775 else 9776 uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by); 9777 9778 // Only processors that can access the new location must be considered 9779 uvm_processor_mask_and(&map_other_processors, 9780 &map_other_processors, 9781 &va_space->accessible_from[uvm_id_value(new_residency)]); 9782 9783 // Exclude caller processor as it must have already been mapped 9784 uvm_processor_mask_clear(&map_other_processors, processor_id); 9785 9786 // Exclude preferred location so it won't get remote mappings 9787 preferred_location = policy->preferred_location; 9788 if (UVM_ID_IS_VALID(preferred_location) && 9789 !uvm_id_equal(new_residency, preferred_location) && 9790 uvm_va_space_processor_has_memory(va_space, preferred_location)) { 9791 uvm_processor_mask_clear(&map_other_processors, preferred_location); 9792 } 9793 9794 // Map the UVM-Lite GPUs if the new location is the preferred location. This 9795 // will only create mappings on first touch. After that they're persistent 9796 // so uvm_va_block_map will be a no-op. 
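    // (The UVM-Lite GPUs are the ones that keep persistent mappings to the
    // preferred location instead of servicing faults; see
    // block_get_uvm_lite_gpus().)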
9797 uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block)); 9798 if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) && 9799 uvm_id_equal(new_residency, preferred_location)) { 9800 for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) { 9801 status = uvm_va_block_map(va_block, 9802 va_block_context, 9803 map_processor_id, 9804 region, 9805 final_page_mask, 9806 UVM_PROT_READ_WRITE_ATOMIC, 9807 UvmEventMapRemoteCauseCoherence, 9808 &local_tracker); 9809 if (status != NV_OK) 9810 goto out; 9811 } 9812 } 9813 9814 uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block)); 9815 9816 // We can't map non-migratable pages to the CPU. If we have any, build a 9817 // new mask of migratable pages and map the CPU separately. 9818 if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) && 9819 !uvm_range_group_all_migratable(va_space, 9820 uvm_va_block_region_start(va_block, region), 9821 uvm_va_block_region_end(va_block, region))) { 9822 uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask; 9823 9824 uvm_range_group_migratable_page_mask(va_block, region, migratable_mask); 9825 if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) { 9826 uvm_processor_mask_t cpu_mask; 9827 uvm_processor_mask_zero(&cpu_mask); 9828 uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU); 9829 9830 status = do_block_add_mappings_after_migration(va_block, 9831 va_block_context, 9832 new_residency, 9833 processor_id, 9834 &cpu_mask, 9835 region, 9836 migratable_mask, 9837 max_prot, 9838 thrashing_processors, 9839 &local_tracker); 9840 if (status != NV_OK) 9841 goto out; 9842 } 9843 9844 uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU); 9845 } 9846 9847 status = do_block_add_mappings_after_migration(va_block, 9848 va_block_context, 9849 new_residency, 9850 processor_id, 9851 &map_other_processors, 9852 region, 9853 final_page_mask, 9854 max_prot, 9855 thrashing_processors, 9856 &local_tracker); 9857 if (status != NV_OK) 9858 goto out; 9859 9860 out: 9861 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 9862 uvm_tracker_deinit(&local_tracker); 9863 return status == NV_OK ? tracker_status : status; 9864 } 9865 9866 // TODO: Bug 1750144: check logical permissions from HMM to know what's the 9867 // maximum allowed. 9868 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block, 9869 uvm_processor_id_t processor_id, 9870 uvm_page_index_t page_index) 9871 { 9872 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9873 uvm_processor_mask_t resident_processors; 9874 NvU32 resident_processors_count; 9875 9876 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id)) 9877 return UVM_PROT_READ_WRITE_ATOMIC; 9878 9879 uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); 9880 resident_processors_count = uvm_processor_mask_get_count(&resident_processors); 9881 9882 if (resident_processors_count == 0) { 9883 return UVM_PROT_NONE; 9884 } 9885 else if (resident_processors_count > 1) { 9886 // If there are many copies, we can only map READ ONLY 9887 // 9888 // The block state doesn't track the mapping target (aperture) of each 9889 // individual PTE, just the permissions and where the data is resident. 9890 // If the data is resident in multiple places, then we have a problem 9891 // since we can't know where the PTE points. 
This means we won't know 9892 // what needs to be unmapped for cases like UvmUnregisterGpu and 9893 // UvmDisablePeerAccess. 9894 // 9895 // The simple way to solve this is to enforce that a read-duplication 9896 // mapping always points to local memory. 9897 if (uvm_processor_mask_test(&resident_processors, processor_id)) 9898 return UVM_PROT_READ_ONLY; 9899 9900 return UVM_PROT_NONE; 9901 } 9902 else { 9903 uvm_processor_id_t atomic_id; 9904 uvm_processor_id_t residency; 9905 uvm_processor_mask_t atomic_mappings; 9906 uvm_processor_mask_t write_mappings; 9907 9908 // Search the id of the processor with the only resident copy 9909 residency = uvm_processor_mask_find_first_id(&resident_processors); 9910 UVM_ASSERT(UVM_ID_IS_VALID(residency)); 9911 9912 // If we cannot map the processor with the resident copy, exit 9913 if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id)) 9914 return UVM_PROT_NONE; 9915 9916 // Fast path: if the page is not mapped anywhere else, it can be safely 9917 // mapped with RWA permission 9918 if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index)) 9919 return UVM_PROT_READ_WRITE_ATOMIC; 9920 9921 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); 9922 9923 // Exclude processors with system-wide atomics disabled from atomic_mappings 9924 uvm_processor_mask_and(&atomic_mappings, 9925 &atomic_mappings, 9926 &va_space->system_wide_atomics_enabled_processors); 9927 9928 // Exclude the processor for which the mapping protections are being computed 9929 uvm_processor_mask_clear(&atomic_mappings, processor_id); 9930 9931 // If there is any processor with atomic mapping, check if it has native atomics to the processor 9932 // with the resident copy. If it does not, we can only map READ ONLY 9933 atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings); 9934 if (UVM_ID_IS_VALID(atomic_id) && 9935 !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) { 9936 return UVM_PROT_READ_ONLY; 9937 } 9938 9939 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings); 9940 9941 // Exclude the processor for which the mapping protections are being computed 9942 uvm_processor_mask_clear(&write_mappings, processor_id); 9943 9944 // At this point, any processor with atomic mappings either has native atomics support to the 9945 // processor with the resident copy or has disabled system-wide atomics. If the requesting 9946 // processor has disabled system-wide atomics or has native atomics to that processor, we can 9947 // map with ATOMIC privileges. Likewise, if there are no other processors with WRITE or ATOMIC 9948 // mappings, we can map with ATOMIC privileges. 
9949 if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) || 9950 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) || 9951 uvm_processor_mask_empty(&write_mappings)) { 9952 return UVM_PROT_READ_WRITE_ATOMIC; 9953 } 9954 9955 return UVM_PROT_READ_WRITE; 9956 } 9957 } 9958 9959 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block, 9960 uvm_va_block_context_t *va_block_context, 9961 uvm_processor_id_t processor_id, 9962 uvm_va_block_region_t region, 9963 const uvm_page_mask_t *page_mask, 9964 UvmEventMapRemoteCause cause) 9965 { 9966 uvm_va_range_t *va_range = va_block->va_range; 9967 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9968 NV_STATUS status = NV_OK; 9969 uvm_page_index_t page_index; 9970 uvm_range_group_range_iter_t iter; 9971 uvm_prot_t prot_to_map; 9972 9973 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 9974 9975 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { 9976 if (!uvm_va_range_vma_check(va_range, va_block_context->mm)) 9977 return NV_OK; 9978 9979 uvm_range_group_range_migratability_iter_first(va_space, 9980 uvm_va_block_region_start(va_block, region), 9981 uvm_va_block_region_end(va_block, region), 9982 &iter); 9983 } 9984 9985 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) 9986 va_block_context->mask_by_prot[prot_to_map - 1].count = 0; 9987 9988 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 9989 // Read duplication takes precedence over SetAccessedBy. Exclude pages 9990 // read-duplicated by performance heuristics 9991 if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index)) 9992 continue; 9993 9994 prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index); 9995 if (prot_to_map == UVM_PROT_NONE) 9996 continue; 9997 9998 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { 9999 while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) { 10000 uvm_range_group_range_migratability_iter_next(va_space, 10001 &iter, 10002 uvm_va_block_region_end(va_block, region)); 10003 } 10004 10005 if (!iter.migratable) 10006 continue; 10007 } 10008 10009 if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0) 10010 uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask); 10011 10012 uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index); 10013 } 10014 10015 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) { 10016 if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0) 10017 continue; 10018 10019 status = uvm_va_block_map(va_block, 10020 va_block_context, 10021 processor_id, 10022 region, 10023 &va_block_context->mask_by_prot[prot_to_map - 1].page_mask, 10024 prot_to_map, 10025 cause, 10026 &va_block->tracker); 10027 if (status != NV_OK) 10028 break; 10029 } 10030 10031 return status; 10032 } 10033 10034 static bool can_read_duplicate(uvm_va_block_t *va_block, 10035 uvm_page_index_t page_index, 10036 const uvm_va_policy_t *policy, 10037 const uvm_perf_thrashing_hint_t *thrashing_hint) 10038 { 10039 if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block))) 10040 return true; 10041 10042 if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && 10043 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) && 
10044 thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN) 10045 return true; 10046 10047 return false; 10048 } 10049 10050 // TODO: Bug 1827400: If the faulting processor has support for native 10051 // atomics to the current location and the faults on the page were 10052 // triggered by atomic accesses only, we keep the current residency. 10053 // This is a short-term solution to exercise remote atomics over 10054 // NVLINK when possible (not only when preferred location is set to 10055 // the remote GPU) as they are much faster than relying on page 10056 // faults and permission downgrades, which cause thrashing. In the 10057 // future, the thrashing detection/prevention heuristics should 10058 // detect and handle this case. 10059 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space, 10060 NvU32 access_type_mask, 10061 uvm_processor_id_t processor_id, 10062 uvm_processor_id_t residency) 10063 { 10064 // This policy can be enabled/disabled using a module parameter 10065 if (!uvm_perf_map_remote_on_native_atomics_fault) 10066 return false; 10067 10068 // Only consider atomics faults 10069 if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK) 10070 return false; 10071 10072 // We cannot differentiate CPU writes from atomics. We exclude CPU faults 10073 // from the logic explained above in order to avoid mapping CPU to vidmem 10074 // memory due to a write. 10075 if (UVM_ID_IS_CPU(processor_id)) 10076 return false; 10077 10078 // On P9 systems (which have native HW support for system-wide atomics), we 10079 // have determined experimentally that placing memory on a GPU yields the 10080 // best performance on most cases (since CPU can cache vidmem but not vice 10081 // versa). Therefore, don't map remotely if the current residency is 10082 // sysmem. 10083 if (UVM_ID_IS_CPU(residency)) 10084 return false; 10085 10086 return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id); 10087 } 10088 10089 // TODO: Bug 1766424: this function works on a single page at a time. This 10090 // could be changed in the future to optimize multiple faults or access 10091 // counter notifications on contiguous pages. 10092 static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block, 10093 uvm_va_block_context_t *va_block_context, 10094 uvm_page_index_t page_index, 10095 uvm_processor_id_t processor_id, 10096 NvU32 access_type_mask, 10097 const uvm_va_policy_t *policy, 10098 const uvm_perf_thrashing_hint_t *thrashing_hint, 10099 uvm_service_operation_t operation, 10100 bool *read_duplicate) 10101 { 10102 uvm_processor_id_t closest_resident_processor; 10103 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10104 bool may_read_duplicate; 10105 uvm_processor_id_t preferred_location; 10106 10107 // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as 10108 // HMM migration is implemented VMAs other than anonymous memory. 10109 if (is_uvm_fault_force_sysmem_set() || uvm_hmm_must_use_sysmem(va_block, va_block_context)) { 10110 *read_duplicate = false; 10111 return UVM_ID_CPU; 10112 } 10113 10114 may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint); 10115 10116 // Read/prefetch faults on a VA range with read duplication enabled 10117 // always create a copy of the page on the faulting processor's memory. 10118 // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH, 10119 // which will lead to read duplication if it is enabled. 
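    // A highest access type of READ or below means every access in the mask
    // was a read or a prefetch, i.e. there is no write or atomic access.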
10120 *read_duplicate = may_read_duplicate && 10121 (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ); 10122 10123 if (*read_duplicate) 10124 return processor_id; 10125 10126 *read_duplicate = false; 10127 10128 // If read-duplication is active in the page but we are not 10129 // read-duplicating because the access type is not a read or a prefetch, 10130 // the faulting processor should get a local copy 10131 if (may_read_duplicate) 10132 return processor_id; 10133 10134 // If the faulting processor is the preferred location always migrate 10135 preferred_location = policy->preferred_location; 10136 if (uvm_id_equal(processor_id, preferred_location)) { 10137 if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) { 10138 UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN); 10139 if (uvm_va_space_processor_has_memory(va_space, processor_id)) 10140 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id)); 10141 } 10142 10143 return processor_id; 10144 } 10145 10146 // If the faulting processor is the CPU, HMM has to migrate the block to 10147 // system memory. 10148 // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements. 10149 if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block)) 10150 return processor_id; 10151 10152 if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 10153 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)], 10154 processor_id)); 10155 return thrashing_hint->pin.residency; 10156 } 10157 10158 closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id); 10159 10160 // If the page is not resident anywhere, select the preferred location as 10161 // long as the preferred location is accessible from the faulting processor. 10162 // Otherwise select the faulting processor. 10163 if (UVM_ID_IS_INVALID(closest_resident_processor)) { 10164 if (UVM_ID_IS_VALID(preferred_location) && 10165 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], 10166 processor_id)) { 10167 return preferred_location; 10168 } 10169 10170 return processor_id; 10171 } 10172 10173 // AccessedBy mappings might have not been created for the CPU if the thread 10174 // which made the memory resident did not have the proper references on the 10175 // mm_struct (for example, the GPU fault handling path when 10176 // uvm_va_space_mm_enabled() is false). 10177 // 10178 // Also, in uvm_migrate_*, we implement a two-pass scheme in which 10179 // AccessedBy mappings may be delayed to the second pass. This can produce 10180 // faults even if the faulting processor is in the accessed_by mask. 10181 // 10182 // Here, we keep it on the current residency and we just add the missing 10183 // mapping. 
10184 if (uvm_processor_mask_test(&policy->accessed_by, processor_id) && 10185 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) && 10186 operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { 10187 return closest_resident_processor; 10188 } 10189 10190 // Check if we should map the closest resident processor remotely on atomic 10191 // fault 10192 if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor)) 10193 return closest_resident_processor; 10194 10195 // If the processor has access to the preferred location, and the page is 10196 // not resident on the accessing processor, move it to the preferred 10197 // location. 10198 if (!uvm_id_equal(closest_resident_processor, processor_id) && 10199 UVM_ID_IS_VALID(preferred_location) && 10200 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id)) 10201 return preferred_location; 10202 10203 // If the page is resident on a processor other than the preferred location, 10204 // or the faulting processor can't access the preferred location, we select 10205 // the faulting processor as the new residency. 10206 return processor_id; 10207 } 10208 10209 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block, 10210 uvm_va_block_context_t *va_block_context, 10211 uvm_page_index_t page_index, 10212 uvm_processor_id_t processor_id, 10213 NvU32 access_type_mask, 10214 const uvm_va_policy_t *policy, 10215 const uvm_perf_thrashing_hint_t *thrashing_hint, 10216 uvm_service_operation_t operation, 10217 bool *read_duplicate) 10218 { 10219 uvm_processor_id_t id; 10220 10221 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, 10222 va_block_context->policy, 10223 uvm_va_block_region_for_page(page_index))); 10224 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10225 va_block_context, 10226 uvm_va_block_region_for_page(page_index))); 10227 10228 id = block_select_residency(va_block, 10229 va_block_context, 10230 page_index, 10231 processor_id, 10232 access_type_mask, 10233 policy, 10234 thrashing_hint, 10235 operation, 10236 read_duplicate); 10237 10238 // If the intended residency doesn't have memory, fall back to the CPU. 10239 if (!block_processor_has_memory(va_block, id)) { 10240 *read_duplicate = false; 10241 return UVM_ID_CPU; 10242 } 10243 10244 return id; 10245 } 10246 10247 static bool check_access_counters_dont_revoke(uvm_va_block_t *block, 10248 uvm_va_block_context_t *block_context, 10249 uvm_va_block_region_t region, 10250 const uvm_processor_mask_t *revoke_processors, 10251 const uvm_page_mask_t *revoke_page_mask, 10252 uvm_prot_t revoke_prot) 10253 { 10254 uvm_processor_id_t id; 10255 for_each_id_in_mask(id, revoke_processors) { 10256 const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot); 10257 10258 uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot); 10259 10260 UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0); 10261 } 10262 10263 return true; 10264 } 10265 10266 // Update service_context->prefetch_hint, service_context->per_processor_masks, 10267 // and service_context->region. 
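// The hint is only computed when the pending migrations target exactly one
// processor. Pages added by the prefetcher get a synthetic
// UVM_FAULT_ACCESS_TYPE_PREFETCH access type, are merged into that
// processor's new_residency mask, and service_context->region is expanded to
// cover them.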
10268 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block, 10269 uvm_service_block_context_t *service_context) 10270 { 10271 uvm_processor_id_t new_residency; 10272 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10273 10274 // Performance heuristics policy: we only consider prefetching when there 10275 // are migrations to a single processor, only. 10276 if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) { 10277 uvm_page_index_t page_index; 10278 uvm_page_mask_t *new_residency_mask; 10279 const uvm_va_policy_t *policy = service_context->block_context.policy; 10280 10281 new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors); 10282 new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10283 10284 // Update prefetch tracking structure with the pages that will migrate 10285 // due to faults 10286 uvm_perf_prefetch_get_hint(va_block, 10287 &service_context->block_context, 10288 new_residency, 10289 new_residency_mask, 10290 service_context->region, 10291 &service_context->prefetch_bitmap_tree, 10292 &service_context->prefetch_hint); 10293 10294 // Obtain the prefetch hint and give a fake fault access type to the 10295 // prefetched pages 10296 if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) { 10297 const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask; 10298 10299 for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) { 10300 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index)); 10301 10302 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH; 10303 10304 if (uvm_va_policy_is_read_duplicate(policy, va_space) || 10305 (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && 10306 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) { 10307 if (service_context->read_duplicate_count++ == 0) 10308 uvm_page_mask_zero(&service_context->read_duplicate_mask); 10309 10310 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); 10311 } 10312 } 10313 10314 uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask); 10315 service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask); 10316 } 10317 } 10318 else { 10319 service_context->prefetch_hint.residency = UVM_ID_INVALID; 10320 } 10321 } 10322 10323 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id, 10324 uvm_processor_id_t new_residency, 10325 uvm_va_block_t *va_block, 10326 uvm_va_block_retry_t *block_retry, 10327 uvm_service_block_context_t *service_context) 10328 { 10329 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10330 uvm_processor_mask_t *all_involved_processors = 10331 &service_context->block_context.make_resident.all_involved_processors; 10332 uvm_page_mask_t *new_residency_mask = 10333 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10334 uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; 10335 uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask; 10336 uvm_make_resident_cause_t cause; 10337 NV_STATUS status; 10338 10339 // 1- Migrate pages 10340 switch (service_context->operation) { 10341 case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS: 10342 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 10343 break; 10344 case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS: 10345 
cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 10346 break; 10347 case UVM_SERVICE_OPERATION_ACCESS_COUNTERS: 10348 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 10349 break; 10350 default: 10351 UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation); 10352 // Set cause to silence compiler warning that it may be unused. 10353 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 10354 break; 10355 } 10356 10357 // Reset masks before all of the make_resident calls 10358 uvm_page_mask_zero(did_migrate_mask); 10359 uvm_processor_mask_zero(all_involved_processors); 10360 10361 // Handle read duplication first so that the caller_page_mask will be free 10362 // to use below and still valid in uvm_va_block_service_finish(). 10363 // TODO: Bug 3660922: need to implement HMM read duplication support. 10364 if (service_context->read_duplicate_count != 0 && 10365 uvm_page_mask_and(caller_page_mask, 10366 new_residency_mask, 10367 &service_context->read_duplicate_mask)) { 10368 status = uvm_va_block_make_resident_read_duplicate(va_block, 10369 block_retry, 10370 &service_context->block_context, 10371 new_residency, 10372 service_context->region, 10373 caller_page_mask, 10374 &service_context->prefetch_hint.prefetch_pages_mask, 10375 cause); 10376 if (status != NV_OK) 10377 return status; 10378 } 10379 10380 if (service_context->read_duplicate_count == 0 || 10381 uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) { 10382 if (service_context->read_duplicate_count == 0) 10383 uvm_page_mask_copy(caller_page_mask, new_residency_mask); 10384 status = uvm_va_block_make_resident_copy(va_block, 10385 block_retry, 10386 &service_context->block_context, 10387 new_residency, 10388 service_context->region, 10389 caller_page_mask, 10390 &service_context->prefetch_hint.prefetch_pages_mask, 10391 cause); 10392 if (status != NV_OK) 10393 return status; 10394 } 10395 10396 if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors)) 10397 service_context->cpu_fault.did_migrate = true; 10398 10399 // 2- Check for ECC errors on all GPUs involved in the migration if CPU is 10400 // the destination. Migrations in response to CPU faults are special 10401 // because they're on the only path (apart from tools) where CUDA is not 10402 // involved and wouldn't have a chance to do its own ECC checking. 10403 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS && 10404 UVM_ID_IS_CPU(new_residency) && 10405 !uvm_processor_mask_empty(all_involved_processors)) { 10406 uvm_gpu_t *gpu; 10407 10408 // Before checking for ECC errors, make sure all of the GPU work 10409 // is finished. Creating mappings on the CPU would have to wait 10410 // for the tracker anyway so this shouldn't hurt performance. 10411 status = uvm_tracker_wait(&va_block->tracker); 10412 if (status != NV_OK) 10413 return status; 10414 10415 for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) { 10416 // We cannot call into RM here so use the no RM ECC check. 10417 status = uvm_gpu_check_ecc_error_no_rm(gpu); 10418 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 10419 // In case we need to call into RM to be sure whether 10420 // there is an ECC error or not, signal that to the 10421 // caller by adding the GPU to the mask. 
10422 // 10423 // In that case the ECC error might be noticed only after 10424 // the CPU mappings have been already created below, 10425 // exposing different CPU threads to the possibly corrupt 10426 // data, but this thread will fault eventually and that's 10427 // considered to be an acceptable trade-off between 10428 // performance and ECC error containment. 10429 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id); 10430 status = NV_OK; 10431 } 10432 if (status != NV_OK) 10433 return status; 10434 } 10435 } 10436 10437 return NV_OK; 10438 } 10439 10440 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id, 10441 uvm_va_block_t *va_block, 10442 uvm_service_block_context_t *service_context) 10443 { 10444 uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id; 10445 uvm_page_mask_t *new_residency_mask = 10446 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10447 uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; 10448 uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask; 10449 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10450 uvm_prot_t new_prot; 10451 uvm_page_index_t page_index; 10452 NV_STATUS status; 10453 10454 // Update residency. 10455 if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask)) 10456 uvm_va_block_make_resident_finish(va_block, 10457 &service_context->block_context, 10458 service_context->region, 10459 caller_page_mask); 10460 10461 uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask); 10462 10463 // The loops below depend on the enums having the following values in order 10464 // to index into service_context->mappings_by_prot[]. 10465 BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1); 10466 BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2); 10467 BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3); 10468 BUILD_BUG_ON(UVM_PROT_MAX != 4); 10469 10470 // 1- Compute mapping protections for the requesting processor on the new 10471 // residency. 10472 for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot) 10473 service_context->mappings_by_prot[new_prot - 1].count = 0; 10474 10475 for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) { 10476 new_prot = compute_new_permission(va_block, 10477 &service_context->block_context, 10478 page_index, 10479 processor_id, 10480 new_residency, 10481 service_context->access_type[page_index]); 10482 10483 if (service_context->mappings_by_prot[new_prot - 1].count++ == 0) 10484 uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask); 10485 10486 uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index); 10487 } 10488 10489 // 2- Revoke permissions 10490 // 10491 // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations. 10492 // Thus, we need to revoke only if residency did not change and we 10493 // are mapping higher than READ ONLY. 
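//
// Given the protection values asserted above, mappings_by_prot[] is indexed
// as follows (illustration):
//
//     UVM_PROT_READ_ONLY         (1) -> mappings_by_prot[0]
//     UVM_PROT_READ_WRITE        (2) -> mappings_by_prot[1]
//     UVM_PROT_READ_WRITE_ATOMIC (3) -> mappings_by_prot[2]
//
// so the revocation loop below only visits the RW and RWA buckets.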
10494 for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10495 bool pages_need_revocation; 10496 uvm_processor_mask_t revoke_processors; 10497 uvm_prot_t revoke_prot; 10498 bool this_processor_has_enabled_atomics; 10499 10500 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10501 continue; 10502 10503 pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask, 10504 &service_context->did_not_migrate_mask, 10505 &service_context->mappings_by_prot[new_prot - 1].page_mask); 10506 if (!pages_need_revocation) 10507 continue; 10508 10509 uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors); 10510 10511 // Do not revoke the processor that took the fault 10512 uvm_processor_mask_clear(&revoke_processors, processor_id); 10513 10514 this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, 10515 processor_id); 10516 10517 // Atomic operations on processors with system-wide atomics 10518 // disabled or with native atomics access to new_residency 10519 // behave like writes. 10520 if (new_prot == UVM_PROT_READ_WRITE || 10521 !this_processor_has_enabled_atomics || 10522 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) { 10523 10524 // Exclude processors with native atomics on the resident copy 10525 uvm_processor_mask_andnot(&revoke_processors, 10526 &revoke_processors, 10527 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 10528 10529 // Exclude processors with disabled system-wide atomics 10530 uvm_processor_mask_and(&revoke_processors, 10531 &revoke_processors, 10532 &va_space->system_wide_atomics_enabled_processors); 10533 } 10534 10535 if (UVM_ID_IS_CPU(processor_id)) { 10536 revoke_prot = UVM_PROT_READ_WRITE_ATOMIC; 10537 } 10538 else { 10539 revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE: 10540 UVM_PROT_READ_WRITE_ATOMIC; 10541 } 10542 10543 // UVM-Lite processors must always have RWA mappings 10544 if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) { 10545 // Access counters should never trigger revocations apart from 10546 // read-duplication, which are performed in the calls to 10547 // uvm_va_block_make_resident_read_duplicate, above. 10548 if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { 10549 UVM_ASSERT(check_access_counters_dont_revoke(va_block, 10550 &service_context->block_context, 10551 service_context->region, 10552 &revoke_processors, 10553 &service_context->revocation_mask, 10554 revoke_prot)); 10555 } 10556 10557 // Downgrade other processors' mappings 10558 status = uvm_va_block_revoke_prot_mask(va_block, 10559 &service_context->block_context, 10560 &revoke_processors, 10561 service_context->region, 10562 &service_context->revocation_mask, 10563 revoke_prot); 10564 if (status != NV_OK) 10565 return status; 10566 } 10567 } 10568 10569 // 3- Map requesting processor with the necessary privileges 10570 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10571 const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask; 10572 10573 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10574 continue; 10575 10576 // 3.1 - Unmap CPU pages 10577 // HMM cpu mappings can be upgraded at any time without notification 10578 // so no need to downgrade first. 
10579 if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS && 10580 UVM_ID_IS_CPU(processor_id) && 10581 !uvm_va_block_is_hmm(va_block)) { 10582 // The kernel can downgrade managed CPU mappings at any time without 10583 // notifying us, which means our PTE state could be stale. We 10584 // handle this by unmapping the CPU PTE and re-mapping it again. 10585 // 10586 // A CPU fault is unexpected if: 10587 // curr_prot == RW || (!is_write && curr_prot == RO) 10588 status = uvm_va_block_unmap(va_block, 10589 &service_context->block_context, 10590 UVM_ID_CPU, 10591 service_context->region, 10592 map_prot_mask, 10593 NULL); 10594 if (status != NV_OK) 10595 return status; 10596 } 10597 10598 // 3.2 - Add new mappings 10599 10600 // The faulting processor can be mapped remotely due to user policy or 10601 // the thrashing mitigation heuristics. Therefore, we set the cause 10602 // accordingly in each case. 10603 10604 // Map pages that are thrashing first 10605 if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) { 10606 uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask; 10607 bool pages_need_mapping = uvm_page_mask_and(helper_page_mask, 10608 map_prot_mask, 10609 &service_context->thrashing_pin_mask); 10610 if (pages_need_mapping) { 10611 status = uvm_va_block_map(va_block, 10612 &service_context->block_context, 10613 processor_id, 10614 service_context->region, 10615 helper_page_mask, 10616 new_prot, 10617 UvmEventMapRemoteCauseThrashing, 10618 &va_block->tracker); 10619 if (status != NV_OK) 10620 return status; 10621 10622 // Remove thrashing pages from the map mask 10623 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask, 10624 map_prot_mask, 10625 &service_context->thrashing_pin_mask); 10626 if (!pages_need_mapping) 10627 continue; 10628 10629 map_prot_mask = helper_page_mask; 10630 } 10631 } 10632 10633 status = uvm_va_block_map(va_block, 10634 &service_context->block_context, 10635 processor_id, 10636 service_context->region, 10637 map_prot_mask, 10638 new_prot, 10639 UvmEventMapRemoteCausePolicy, 10640 &va_block->tracker); 10641 if (status != NV_OK) 10642 return status; 10643 } 10644 10645 // 4- If pages did migrate, map SetAccessedBy processors, except for 10646 // UVM-Lite 10647 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10648 bool pages_need_mapping; 10649 10650 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10651 continue; 10652 10653 pages_need_mapping = uvm_page_mask_and(caller_page_mask, 10654 new_residency_mask, 10655 &service_context->mappings_by_prot[new_prot - 1].page_mask); 10656 if (!pages_need_mapping) 10657 continue; 10658 10659 // Map pages that are thrashing 10660 if (service_context->thrashing_pin_count > 0) { 10661 uvm_page_index_t page_index; 10662 10663 for_each_va_block_page_in_region_mask(page_index, 10664 &service_context->thrashing_pin_mask, 10665 service_context->region) { 10666 uvm_processor_mask_t *map_thrashing_processors = NULL; 10667 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index); 10668 10669 // Check protection type 10670 if (!uvm_page_mask_test(caller_page_mask, page_index)) 10671 continue; 10672 10673 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr); 10674 10675 status = uvm_va_block_add_mappings_after_migration(va_block, 10676 &service_context->block_context, 10677 new_residency, 10678 processor_id, 10679 uvm_va_block_region_for_page(page_index), 
10680 caller_page_mask, 10681 new_prot, 10682 map_thrashing_processors); 10683 if (status != NV_OK) 10684 return status; 10685 } 10686 10687 pages_need_mapping = uvm_page_mask_andnot(caller_page_mask, 10688 caller_page_mask, 10689 &service_context->thrashing_pin_mask); 10690 if (!pages_need_mapping) 10691 continue; 10692 } 10693 10694 // Map the rest of pages in a single shot 10695 status = uvm_va_block_add_mappings_after_migration(va_block, 10696 &service_context->block_context, 10697 new_residency, 10698 processor_id, 10699 service_context->region, 10700 caller_page_mask, 10701 new_prot, 10702 NULL); 10703 if (status != NV_OK) 10704 return status; 10705 } 10706 10707 return NV_OK; 10708 } 10709 10710 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id, 10711 uvm_va_block_t *va_block, 10712 uvm_va_block_retry_t *block_retry, 10713 uvm_service_block_context_t *service_context) 10714 { 10715 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10716 uvm_processor_id_t new_residency; 10717 NV_STATUS status = NV_OK; 10718 10719 uvm_assert_mutex_locked(&va_block->lock); 10720 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, 10721 service_context->block_context.policy, 10722 service_context->region)); 10723 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10724 &service_context->block_context, 10725 service_context->region)); 10726 10727 // GPU fault servicing must be done under the VA space read lock. GPU fault 10728 // servicing is required for RM to make forward progress, and we allow other 10729 // threads to call into RM while holding the VA space lock in read mode. If 10730 // we took the VA space lock in write mode on the GPU fault service path, 10731 // we could deadlock because the thread in RM which holds the VA space lock 10732 // for read wouldn't be able to complete until fault servicing completes. 
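//
// In other words (as asserted below): servicing a replayable fault from a GPU
// requires the VA space lock to be held in read mode specifically, while CPU
// faults, non-replayable faults and access-counter notifications only require
// that the lock be held in some mode.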
10733 if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id)) 10734 uvm_assert_rwsem_locked(&va_space->lock); 10735 else 10736 uvm_assert_rwsem_locked_read(&va_space->lock); 10737 10738 uvm_va_block_get_prefetch_hint(va_block, service_context); 10739 10740 for_each_id_in_mask(new_residency, &service_context->resident_processors) { 10741 if (uvm_va_block_is_hmm(va_block)) { 10742 status = uvm_hmm_va_block_service_locked(processor_id, new_residency, va_block, block_retry, service_context); 10743 if (status != NV_OK) 10744 break; 10745 10746 continue; 10747 } 10748 10749 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context); 10750 if (status != NV_OK) 10751 break; 10752 10753 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 10754 if (status != NV_OK) 10755 break; 10756 } 10757 10758 return status; 10759 } 10760 10761 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block, 10762 uvm_va_block_context_t *va_block_context, 10763 uvm_processor_id_t processor_id, 10764 uvm_page_index_t page_index, 10765 uvm_fault_type_t access_type, 10766 bool allow_migration) 10767 { 10768 uvm_va_range_t *va_range = va_block->va_range; 10769 uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type); 10770 10771 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, 10772 va_block_context->policy, 10773 uvm_va_block_region_for_page(page_index))); 10774 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10775 va_block_context, 10776 uvm_va_block_region_for_page(page_index))); 10777 10778 // CPU permissions are checked later by block_map_cpu_page. 10779 // 10780 // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because 10781 // it can also be called from change_pte. Make change_pte call this 10782 // function and only check CPU permissions here. 10783 if (UVM_ID_IS_GPU(processor_id)) { 10784 if (va_range && uvm_va_range_is_managed_zombie(va_range)) 10785 return NV_ERR_INVALID_ADDRESS; 10786 10787 // GPU faults only check vma permissions if a mm is registered with the 10788 // VA space (ie. uvm_va_space_mm_retain_lock(va_space) != NULL) or if 10789 // uvm_enable_builtin_tests is set, because the Linux kernel can change 10790 // vm_flags at any moment (for example on mprotect) and here we are not 10791 // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that 10792 // this scenario does not happen. 10793 if ((va_block_context->mm || uvm_enable_builtin_tests) && 10794 (access_prot > compute_logical_prot(va_block, va_block_context, page_index))) 10795 return NV_ERR_INVALID_ACCESS_TYPE; 10796 } 10797 10798 // Non-migratable range: 10799 // - CPU accesses are always fatal, regardless of the VA range residency 10800 // - GPU accesses are fatal if the GPU can't map the preferred location 10801 if (!allow_migration) { 10802 UVM_ASSERT(!uvm_va_block_is_hmm(va_block)); 10803 10804 if (UVM_ID_IS_CPU(processor_id)) { 10805 return NV_ERR_INVALID_OPERATION; 10806 } 10807 else { 10808 uvm_va_space_t *va_space = va_range->va_space; 10809 10810 return uvm_processor_mask_test( 10811 &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)], 10812 processor_id)? 10813 NV_OK : NV_ERR_INVALID_ACCESS_TYPE; 10814 } 10815 } 10816 10817 return NV_OK; 10818 } 10819 10820 // Check if we are faulting on a page with valid permissions to check if we can 10821 // skip fault handling. 
See uvm_va_block_t::cpu::fault_authorized for more 10822 // details 10823 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block, 10824 uvm_page_index_t page_index, 10825 uvm_fault_access_type_t fault_access_type) 10826 { 10827 // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for 10828 // HMM? 10829 if (uvm_va_block_is_hmm(va_block)) 10830 return false; 10831 10832 if (block_page_is_processor_authorized(va_block, 10833 page_index, 10834 UVM_ID_CPU, 10835 uvm_fault_access_type_to_prot(fault_access_type))) { 10836 NvU64 now = NV_GETTIME(); 10837 pid_t pid = current->pid; 10838 10839 // Latch the pid/timestamp/page_index values for the first time 10840 if (!va_block->cpu.fault_authorized.first_fault_stamp) { 10841 va_block->cpu.fault_authorized.first_fault_stamp = now; 10842 va_block->cpu.fault_authorized.first_pid = pid; 10843 va_block->cpu.fault_authorized.page_index = page_index; 10844 10845 return true; 10846 } 10847 10848 // If the same thread shows up again, this means that the kernel 10849 // downgraded the page's PTEs. Service the fault to force a remap of 10850 // the page. 10851 if (va_block->cpu.fault_authorized.first_pid == pid && 10852 va_block->cpu.fault_authorized.page_index == page_index) { 10853 va_block->cpu.fault_authorized.first_fault_stamp = 0; 10854 } 10855 else { 10856 // If the window has expired, clear the information and service the 10857 // fault. Otherwise, just return 10858 if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns) 10859 va_block->cpu.fault_authorized.first_fault_stamp = 0; 10860 else 10861 return true; 10862 } 10863 } 10864 10865 return false; 10866 } 10867 10868 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block, 10869 uvm_va_block_retry_t *va_block_retry, 10870 NvU64 fault_addr, 10871 uvm_fault_access_type_t fault_access_type, 10872 uvm_service_block_context_t *service_context) 10873 { 10874 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10875 NV_STATUS status = NV_OK; 10876 uvm_page_index_t page_index; 10877 uvm_perf_thrashing_hint_t thrashing_hint; 10878 uvm_processor_id_t new_residency; 10879 bool read_duplicate; 10880 10881 uvm_assert_rwsem_locked(&va_space->lock); 10882 10883 UVM_ASSERT(fault_addr >= va_block->start); 10884 UVM_ASSERT(fault_addr <= va_block->end); 10885 10886 uvm_assert_mmap_lock_locked(service_context->block_context.mm); 10887 10888 service_context->block_context.policy = uvm_va_policy_get(va_block, fault_addr); 10889 10890 if (service_context->num_retries == 0) { 10891 // notify event to tools/performance heuristics 10892 uvm_perf_event_notify_cpu_fault(&va_space->perf_events, 10893 va_block, 10894 service_context->block_context.policy->preferred_location, 10895 fault_addr, 10896 fault_access_type > UVM_FAULT_ACCESS_TYPE_READ, 10897 KSTK_EIP(current)); 10898 } 10899 10900 // Check logical permissions 10901 page_index = uvm_va_block_cpu_page_index(va_block, fault_addr); 10902 status = uvm_va_block_check_logical_permissions(va_block, 10903 &service_context->block_context, 10904 UVM_ID_CPU, 10905 page_index, 10906 fault_access_type, 10907 uvm_range_group_address_migratable(va_space, fault_addr)); 10908 if (status != NV_OK) 10909 return status; 10910 10911 uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc); 10912 10913 if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type)) 10914 return NV_OK; 10915 10916 thrashing_hint = 
uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU); 10917 // Throttling is implemented by sleeping in the fault handler on the CPU 10918 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) { 10919 service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp; 10920 return NV_WARN_MORE_PROCESSING_REQUIRED; 10921 } 10922 10923 service_context->read_duplicate_count = 0; 10924 service_context->thrashing_pin_count = 0; 10925 service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS; 10926 10927 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 10928 uvm_page_mask_zero(&service_context->thrashing_pin_mask); 10929 uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index); 10930 service_context->thrashing_pin_count = 1; 10931 } 10932 10933 // Compute new residency and update the masks 10934 new_residency = uvm_va_block_select_residency(va_block, 10935 &service_context->block_context, 10936 page_index, 10937 UVM_ID_CPU, 10938 uvm_fault_access_type_mask_bit(fault_access_type), 10939 service_context->block_context.policy, 10940 &thrashing_hint, 10941 UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS, 10942 &read_duplicate); 10943 10944 // Initialize the minimum necessary state in the fault service context 10945 uvm_processor_mask_zero(&service_context->resident_processors); 10946 10947 // Set new residency and update the masks 10948 uvm_processor_mask_set(&service_context->resident_processors, new_residency); 10949 10950 // The masks need to be fully zeroed as the fault region may grow due to prefetching 10951 uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 10952 uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index); 10953 10954 if (read_duplicate) { 10955 uvm_page_mask_zero(&service_context->read_duplicate_mask); 10956 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); 10957 service_context->read_duplicate_count = 1; 10958 } 10959 10960 service_context->access_type[page_index] = fault_access_type; 10961 10962 service_context->region = uvm_va_block_region_for_page(page_index); 10963 10964 status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context); 10965 10966 ++service_context->num_retries; 10967 10968 return status; 10969 } 10970 10971 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block, 10972 NvU64 fault_addr, 10973 bool is_write, 10974 uvm_service_block_context_t *service_context) 10975 { 10976 NV_STATUS status; 10977 uvm_va_block_retry_t va_block_retry; 10978 uvm_fault_access_type_t fault_access_type; 10979 10980 if (is_write) 10981 fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG; 10982 else 10983 fault_access_type = UVM_FAULT_ACCESS_TYPE_READ; 10984 10985 service_context->num_retries = 0; 10986 service_context->cpu_fault.did_migrate = false; 10987 10988 // We have to use vm_insert_page instead of handing the page to the kernel 10989 // and letting it insert the mapping, and we must do that while holding the 10990 // lock on this VA block. Otherwise there will be a window in which we think 10991 // we've mapped the page but the CPU mapping hasn't actually been created 10992 // yet. During that window a GPU fault event could arrive and claim 10993 // ownership of that VA, "unmapping" it. Then later the kernel would 10994 // eventually establish the mapping, and we'd end up with both CPU and GPU 10995 // thinking they each owned the page. 
10996 // 10997 // This function must only be called when it's safe to call vm_insert_page. 10998 // That is, there must be a reference held on the vma's vm_mm, and 10999 // vm_mm->mmap_lock is held in at least read mode. Note that current->mm 11000 // might not be vma->vm_mm. 11001 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, 11002 &va_block_retry, 11003 block_cpu_fault_locked(va_block, 11004 &va_block_retry, 11005 fault_addr, 11006 fault_access_type, 11007 service_context)); 11008 return status; 11009 } 11010 11011 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block) 11012 { 11013 uvm_va_range_t *va_range; 11014 uvm_va_block_t *block; 11015 size_t index; 11016 11017 va_range = uvm_va_range_find(va_space, addr); 11018 if (!va_range) 11019 return uvm_hmm_va_block_find(va_space, addr, out_block); 11020 11021 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || 11022 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); 11023 11024 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) 11025 return NV_ERR_INVALID_ADDRESS; 11026 11027 index = uvm_va_range_block_index(va_range, addr); 11028 block = uvm_va_range_block(va_range, index); 11029 if (!block) 11030 return NV_ERR_OBJECT_NOT_FOUND; 11031 11032 *out_block = block; 11033 return NV_OK; 11034 } 11035 11036 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space, 11037 uvm_va_range_t *va_range, 11038 NvU64 addr, 11039 uvm_va_block_context_t *va_block_context, 11040 uvm_va_block_t **out_block) 11041 { 11042 size_t index; 11043 11044 if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0) 11045 return NV_ERR_NO_MEMORY; 11046 11047 if (!va_range) { 11048 if (!va_block_context || !va_block_context->mm) 11049 return NV_ERR_INVALID_ADDRESS; 11050 return uvm_hmm_va_block_find_create(va_space, addr, va_block_context, out_block); 11051 } 11052 11053 UVM_ASSERT(addr >= va_range->node.start); 11054 UVM_ASSERT(addr <= va_range->node.end); 11055 11056 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || 11057 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); 11058 11059 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) 11060 return NV_ERR_INVALID_ADDRESS; 11061 11062 index = uvm_va_range_block_index(va_range, addr); 11063 return uvm_va_range_block_create(va_range, index, out_block); 11064 } 11065 11066 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space, 11067 NvU64 addr, 11068 uvm_va_block_context_t *va_block_context, 11069 uvm_va_block_t **out_block) 11070 { 11071 uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr); 11072 11073 return uvm_va_block_find_create_in_range(va_space, va_range, addr, va_block_context, out_block); 11074 } 11075 11076 // Launch a synchronous, encrypted copy between GPU and CPU. 11077 // 11078 // The copy entails a GPU-side encryption (relying on the Copy Engine), and a 11079 // CPU-side decryption step, such that the destination CPU buffer pointed by 11080 // dst_plain will contain the unencrypted (plain text) contents. The destination 11081 // buffer can be in protected or unprotected sysmem, while the source buffer 11082 // must be in protected vidmem. 11083 // 11084 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE. 11085 // 11086 // The input tracker, if not NULL, is internally acquired by the push 11087 // responsible for the encrypted copy. 
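//
// Illustrative usage (this mirrors the call made by va_block_read_gpu_to_cpu()
// further below; names other than the function itself are placeholders):
//
//     status = encrypted_memcopy_gpu_to_cpu(gpu,
//                                           dst_cpu_buffer,
//                                           src_vidmem_address,
//                                           size,
//                                           &va_block->tracker,
//                                           "Encrypted read from [0x%llx, 0x%llx)",
//                                           src,
//                                           src + size);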
11088 __attribute__ ((format(printf, 6, 7))) 11089 static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu, 11090 void *dst_plain, 11091 uvm_gpu_address_t src_gpu_address, 11092 size_t size, 11093 uvm_tracker_t *tracker, 11094 const char *format, 11095 ...) 11096 { 11097 NV_STATUS status; 11098 UvmCslIv decrypt_iv; 11099 uvm_push_t push; 11100 uvm_conf_computing_dma_buffer_t *dma_buffer; 11101 uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address; 11102 void *src_cipher, *auth_tag; 11103 va_list args; 11104 11105 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 11106 UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE); 11107 11108 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL); 11109 if (status != NV_OK) 11110 return status; 11111 11112 va_start(args, format); 11113 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args); 11114 va_end(args); 11115 11116 if (status != NV_OK) 11117 goto out; 11118 11119 uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv); 11120 11121 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 11122 auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 11123 gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address); 11124 11125 status = uvm_push_end_and_wait(&push); 11126 if (status != NV_OK) 11127 goto out; 11128 11129 src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc); 11130 auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag); 11131 status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag); 11132 11133 out: 11134 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL); 11135 return status; 11136 } 11137 11138 // Launch a synchronous, encrypted copy between CPU and GPU. 11139 // 11140 // The source CPU buffer pointed by src_plain contains the unencrypted (plain 11141 // text) contents; the function internally performs a CPU-side encryption step 11142 // before launching the GPU-side CE decryption. The source buffer can be in 11143 // protected or unprotected sysmem, while the destination buffer must be in 11144 // protected vidmem. 11145 // 11146 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE. 11147 // 11148 // The input tracker, if not NULL, is internally acquired by the push 11149 // responsible for the encrypted copy. 11150 __attribute__ ((format(printf, 6, 7))) 11151 static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu, 11152 uvm_gpu_address_t dst_gpu_address, 11153 void *src_plain, 11154 size_t size, 11155 uvm_tracker_t *tracker, 11156 const char *format, 11157 ...) 
11158 { 11159 NV_STATUS status; 11160 uvm_push_t push; 11161 uvm_conf_computing_dma_buffer_t *dma_buffer; 11162 uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address; 11163 void *dst_cipher, *auth_tag; 11164 va_list args; 11165 11166 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 11167 UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE); 11168 11169 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL); 11170 if (status != NV_OK) 11171 return status; 11172 11173 va_start(args, format); 11174 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args); 11175 va_end(args); 11176 11177 if (status != NV_OK) 11178 goto out; 11179 11180 dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc); 11181 auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag); 11182 uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag); 11183 11184 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 11185 auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 11186 gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address); 11187 11188 status = uvm_push_end_and_wait(&push); 11189 11190 out: 11191 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL); 11192 return status; 11193 } 11194 11195 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block, 11196 uvm_gpu_t *gpu, 11197 uvm_gpu_address_t dst_gpu_address, 11198 NvU64 dst, 11199 uvm_mem_t *src_mem, 11200 size_t size) 11201 { 11202 NV_STATUS status; 11203 uvm_push_t push; 11204 uvm_gpu_address_t src_gpu_address; 11205 11206 if (uvm_conf_computing_mode_enabled(gpu)) { 11207 return encrypted_memcopy_cpu_to_gpu(gpu, 11208 dst_gpu_address, 11209 uvm_mem_get_cpu_addr_kernel(src_mem), 11210 size, 11211 &va_block->tracker, 11212 "Encrypted write to [0x%llx, 0x%llx)", 11213 dst, 11214 dst + size); 11215 } 11216 11217 status = uvm_push_begin_acquire(gpu->channel_manager, 11218 UVM_CHANNEL_TYPE_CPU_TO_GPU, 11219 &va_block->tracker, 11220 &push, 11221 "Direct write to [0x%llx, 0x%llx)", 11222 dst, 11223 dst + size); 11224 if (status != NV_OK) 11225 return status; 11226 11227 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu); 11228 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); 11229 return uvm_push_end_and_wait(&push); 11230 } 11231 11232 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block, 11233 uvm_va_block_context_t *block_context, 11234 NvU64 dst, 11235 uvm_mem_t *src_mem, 11236 size_t size) 11237 { 11238 NV_STATUS status; 11239 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst); 11240 NvU64 page_offset = dst & (PAGE_SIZE - 1); 11241 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); 11242 uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index); 11243 11244 uvm_assert_mutex_locked(&va_block->lock); 11245 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size); 11246 11247 if (UVM_ID_IS_INVALID(proc)) 11248 proc = UVM_ID_CPU; 11249 11250 block_context->policy = uvm_va_policy_get(va_block, dst); 11251 11252 // Use make_resident() in all cases to break read-duplication, but 11253 // block_retry can be NULL as if the page is not resident yet we will make 11254 // it resident on the 
CPU. 11255 // Notably we don't care about coherence with respect to atomics from other 11256 // processors. 11257 status = uvm_va_block_make_resident(va_block, 11258 NULL, 11259 block_context, 11260 proc, 11261 region, 11262 NULL, 11263 NULL, 11264 UVM_MAKE_RESIDENT_CAUSE_API_TOOLS); 11265 11266 if (status != NV_OK) 11267 return status; 11268 11269 if (UVM_ID_IS_CPU(proc)) { 11270 char *mapped_page; 11271 struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 11272 void *src = uvm_mem_get_cpu_addr_kernel(src_mem); 11273 11274 status = uvm_tracker_wait(&va_block->tracker); 11275 if (status != NV_OK) 11276 return status; 11277 11278 mapped_page = (char *)kmap(page); 11279 memcpy(mapped_page + page_offset, src, size); 11280 kunmap(page); 11281 11282 return NV_OK; 11283 } 11284 else { 11285 uvm_gpu_t *dst_gpu; 11286 uvm_gpu_address_t dst_gpu_address; 11287 11288 UVM_ASSERT(UVM_ID_IS_GPU(proc)); 11289 11290 dst_gpu = block_get_gpu(va_block, proc); 11291 11292 dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu); 11293 dst_gpu_address.address += page_offset; 11294 11295 return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size); 11296 } 11297 } 11298 11299 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block, 11300 uvm_mem_t *dst_mem, 11301 uvm_gpu_t *gpu, 11302 uvm_gpu_address_t src_gpu_address, 11303 NvU64 src, 11304 size_t size) 11305 { 11306 NV_STATUS status; 11307 uvm_push_t push; 11308 uvm_gpu_address_t dst_gpu_address; 11309 11310 if (uvm_conf_computing_mode_enabled(gpu)) { 11311 return encrypted_memcopy_gpu_to_cpu(gpu, 11312 uvm_mem_get_cpu_addr_kernel(dst_mem), 11313 src_gpu_address, 11314 size, 11315 &va_block->tracker, 11316 "Encrypted read from [0x%llx, 0x%llx)", 11317 src, 11318 src + size); 11319 } 11320 11321 status = uvm_push_begin_acquire(gpu->channel_manager, 11322 UVM_CHANNEL_TYPE_GPU_TO_CPU, 11323 &va_block->tracker, 11324 &push, 11325 "Direct read from [0x%llx, 0x%llx)", 11326 src, 11327 src + size); 11328 if (status != NV_OK) 11329 return status; 11330 11331 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu); 11332 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); 11333 return uvm_push_end_and_wait(&push); 11334 } 11335 11336 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size) 11337 { 11338 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src); 11339 NvU64 page_offset = src & (PAGE_SIZE - 1); 11340 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); 11341 void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem); 11342 11343 uvm_assert_mutex_locked(&va_block->lock); 11344 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size); 11345 11346 if (UVM_ID_IS_INVALID(proc)) { 11347 memset(dst, 0, size); 11348 return NV_OK; 11349 } 11350 else if (UVM_ID_IS_CPU(proc)) { 11351 NV_STATUS status; 11352 char *mapped_page; 11353 struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 11354 11355 status = uvm_tracker_wait(&va_block->tracker); 11356 if (status != NV_OK) 11357 return status; 11358 11359 mapped_page = (char *)kmap(page); 11360 memcpy(dst, mapped_page + page_offset, size); 11361 kunmap(page); 11362 11363 return NV_OK; 11364 } 11365 else { 11366 uvm_gpu_address_t src_gpu_address; 11367 uvm_gpu_t *gpu = block_get_gpu(va_block, proc); 11368 11369 
src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu); 11370 src_gpu_address.address += page_offset; 11371 11372 return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size); 11373 } 11374 } 11375 11376 // Deferred work item reestablishing accessed by mappings after eviction. On 11377 // GPUs with access counters enabled, the evicted GPU will also get remote 11378 // mappings. 11379 static void block_add_eviction_mappings(void *args) 11380 { 11381 uvm_va_block_t *va_block = (uvm_va_block_t*)args; 11382 uvm_va_space_t *va_space; 11383 uvm_processor_id_t id; 11384 uvm_va_block_context_t *block_context = NULL; 11385 struct mm_struct *mm = NULL; 11386 11387 uvm_mutex_lock(&va_block->lock); 11388 va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 11389 uvm_mutex_unlock(&va_block->lock); 11390 11391 if (!va_space) { 11392 // Block has been killed in the meantime 11393 goto done; 11394 } 11395 11396 mm = uvm_va_space_mm_retain_lock(va_space); 11397 11398 block_context = uvm_va_block_context_alloc(mm); 11399 if (!block_context) 11400 goto done; 11401 11402 // The block wasn't dead when we checked above and that's enough to 11403 // guarantee that the VA space is still around, because 11404 // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that 11405 // flush waits for this function call to finish. 11406 uvm_va_space_down_read(va_space); 11407 11408 // Now that we have the VA space lock held, we can check whether the block 11409 // is still alive since the VA space write lock is needed to kill blocks. 11410 if (uvm_va_block_is_dead(va_block)) 11411 goto unlock; 11412 11413 if (uvm_va_block_is_hmm(va_block)) { 11414 uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context); 11415 } 11416 else { 11417 uvm_va_range_t *va_range = va_block->va_range; 11418 NV_STATUS status = NV_OK; 11419 11420 block_context->policy = uvm_va_range_get_policy(va_range); 11421 for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) { 11422 status = uvm_va_block_set_accessed_by(va_block, block_context, id); 11423 if (status != NV_OK) 11424 break; 11425 } 11426 11427 if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) { 11428 uvm_processor_mask_t map_processors; 11429 11430 // Exclude the processors that have been already mapped due to 11431 // AccessedBy 11432 uvm_processor_mask_andnot(&map_processors, 11433 &va_block->evicted_gpus, 11434 &uvm_va_range_get_policy(va_range)->accessed_by); 11435 11436 for_each_gpu_id_in_mask(id, &map_processors) { 11437 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); 11438 uvm_va_block_gpu_state_t *gpu_state; 11439 11440 if (!gpu->parent->access_counters_supported) 11441 continue; 11442 11443 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 11444 UVM_ASSERT(gpu_state); 11445 11446 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add 11447 // remote mappings to read-duplicated pages. Add support for it 11448 // or create a new function. 
11449 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL, 11450 uvm_va_block_add_mappings(va_block, 11451 block_context, 11452 id, 11453 uvm_va_block_region_from_block(va_block), 11454 &gpu_state->evicted, 11455 UvmEventMapRemoteCauseEviction)); 11456 if (status != NV_OK) 11457 break; 11458 } 11459 } 11460 11461 if (status != NV_OK) { 11462 UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n", 11463 va_block->start, 11464 va_block->end, 11465 nvstatusToString(status), 11466 uvm_va_space_processor_name(va_space, id)); 11467 } 11468 } 11469 11470 unlock: 11471 uvm_va_space_up_read(va_space); 11472 uvm_va_block_context_free(block_context); 11473 11474 done: 11475 uvm_va_space_mm_release_unlock(va_space, mm); 11476 uvm_va_block_release(va_block); 11477 } 11478 11479 static void block_add_eviction_mappings_entry(void *args) 11480 { 11481 UVM_ENTRY_VOID(block_add_eviction_mappings(args)); 11482 } 11483 11484 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, 11485 uvm_gpu_t *gpu, 11486 uvm_gpu_chunk_t *root_chunk, 11487 uvm_tracker_t *tracker) 11488 { 11489 NV_STATUS status = NV_OK; 11490 NvU32 i; 11491 uvm_va_block_gpu_state_t *gpu_state; 11492 uvm_va_block_region_t chunk_region; 11493 size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu); 11494 size_t chunks_to_evict = 0; 11495 uvm_va_block_context_t *block_context; 11496 uvm_page_mask_t *pages_to_evict; 11497 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); 11498 uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 11499 struct mm_struct *mm; 11500 bool accessed_by_set = false; 11501 11502 uvm_assert_mutex_locked(&va_block->lock); 11503 11504 // The block might have been killed in the meantime 11505 if (!va_space) 11506 return NV_OK; 11507 11508 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 11509 if (!gpu_state) 11510 return NV_OK; 11511 11512 if (va_block_test && va_block_test->inject_eviction_error) { 11513 va_block_test->inject_eviction_error = false; 11514 return NV_ERR_NO_MEMORY; 11515 } 11516 11517 // We cannot take this block's VA space or mmap_lock locks on the eviction 11518 // path, however, we retain mm in order to support accounting of CPU memory 11519 // allocations. If mappings need to be created, 11520 // block_add_eviction_mappings() will be scheduled below. 
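// Note that only the mm reference is taken here (uvm_va_space_mm_retain()),
// not the locked variant (uvm_va_space_mm_retain_lock()) used elsewhere in
// this file, precisely because mmap_lock must not be acquired on the eviction
// path; any mappings that do require it are deferred to
// block_add_eviction_mappings().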
11521 mm = uvm_va_space_mm_retain(va_space); 11522 block_context = uvm_va_block_context_alloc(mm); 11523 if (!block_context) { 11524 if (mm) 11525 uvm_va_space_mm_release(va_space); 11526 return NV_ERR_NO_MEMORY; 11527 } 11528 11529 pages_to_evict = &block_context->caller_page_mask; 11530 uvm_page_mask_zero(pages_to_evict); 11531 chunk_region.outer = 0; 11532 11533 // Find all chunks that are subchunks of the root chunk 11534 for (i = 0; i < num_gpu_chunks; ++i) { 11535 uvm_chunk_size_t chunk_size; 11536 size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size); 11537 UVM_ASSERT(chunk_index == i); 11538 chunk_region.first = chunk_region.outer; 11539 chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE; 11540 11541 if (!gpu_state->chunks[i]) 11542 continue; 11543 if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk)) 11544 continue; 11545 11546 if (uvm_va_block_is_hmm(va_block)) { 11547 status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region); 11548 if (status != NV_OK) 11549 break; 11550 } 11551 11552 uvm_page_mask_region_fill(pages_to_evict, chunk_region); 11553 ++chunks_to_evict; 11554 } 11555 11556 if (chunks_to_evict == 0) 11557 goto out; 11558 11559 // Only move pages resident on the GPU 11560 uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id)); 11561 uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors); 11562 11563 if (uvm_va_block_is_hmm(va_block)) { 11564 status = uvm_hmm_va_block_evict_chunks(va_block, 11565 block_context, 11566 pages_to_evict, 11567 uvm_va_block_region_from_block(va_block), 11568 &accessed_by_set); 11569 } 11570 else { 11571 block_context->policy = uvm_va_range_get_policy(va_block->va_range); 11572 accessed_by_set = uvm_processor_mask_get_count(&block_context->policy->accessed_by) > 0; 11573 11574 // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's 11575 // not necessary to do so for eviction. Add a version that unmaps only 11576 // the processors that have mappings to the pages being evicted. 11577 status = uvm_va_block_make_resident(va_block, 11578 NULL, 11579 block_context, 11580 UVM_ID_CPU, 11581 uvm_va_block_region_from_block(va_block), 11582 pages_to_evict, 11583 NULL, 11584 UVM_MAKE_RESIDENT_CAUSE_EVICTION); 11585 } 11586 if (status != NV_OK) 11587 goto out; 11588 11589 // VA space lock may not be held and hence we cannot reestablish any 11590 // mappings here and need to defer it to a work queue. 11591 // 11592 // Reading the accessed_by mask without the VA space lock is safe because 11593 // adding a new processor to the mask triggers going over all the VA blocks 11594 // in the range and locking them. And we hold one of the VA block's locks. 11595 // 11596 // If uvm_va_range_set_accessed_by() hasn't called 11597 // uvm_va_block_set_accessed_by() for this block yet then it will take care 11598 // of adding the mapping after we are done. If it already did then we are 11599 // guaranteed to see the new processor in the accessed_by mask because we 11600 // locked the block's lock that the thread calling 11601 // uvm_va_range_set_accessed_by() unlocked after updating the mask. 11602 // 11603 // If a processor gets removed from the mask then we might not notice and 11604 // schedule the work item anyway, but that's benign as 11605 // block_add_eviction_mappings() re-examines the mask. 
11606 // 11607 // Checking if access counters migrations are enabled on a VA space is racy 11608 // without holding the VA space lock. However, this is fine as 11609 // block_add_eviction_mappings() reexamines the value with the VA space 11610 // lock being held. 11611 if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) { 11612 // Always retain the VA block first so that it's safe for the deferred 11613 // callback to release it immediately after it runs. 11614 uvm_va_block_retain(va_block); 11615 11616 if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q, 11617 &va_block->eviction_mappings_q_item)) { 11618 // And release it if no new callback was scheduled 11619 uvm_va_block_release_no_destroy(va_block); 11620 } 11621 } 11622 11623 status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker); 11624 if (status != NV_OK) 11625 goto out; 11626 11627 for (i = 0; i < num_gpu_chunks; ++i) { 11628 uvm_gpu_id_t accessing_gpu_id; 11629 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 11630 11631 if (!chunk) 11632 continue; 11633 if (!uvm_gpu_chunk_same_root(chunk, root_chunk)) 11634 continue; 11635 11636 // Remove the mappings of indirect peers from the reverse map. We 11637 // access the indirect peer mask from the VA space without holding the 11638 // VA space lock. Therefore, we can race with enable_peer/disable_peer 11639 // operations. However this is fine: 11640 // 11641 // The enable_peer sequence is as follows: 11642 // 11643 // set_bit in va_space->indirect_peers 11644 // uvm_va_block_enable_peer; 11645 // 11646 // - If we read the mask BEFORE it is set or AFTER the mapping has 11647 // been added to the map there is no race. 11648 // - If we read the mask AFTER it is set but BEFORE adding the mapping 11649 // to the reverse map, we will try to remove it although it is not 11650 // there yet. Therefore, we use 11651 // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does 11652 // not check if the mapping is present in the reverse map. 11653 // 11654 // The disable_peer sequence is as follows: 11655 // 11656 // uvm_va_block_disable_peer; 11657 // clear_bit in va_space->indirect_peers 11658 // 11659 // - If we read the mask BEFORE the mapping has been added to the map 11660 // or AFTER the bit has been cleared, there is no race. 11661 // - If we read the mask AFTER the mapping has been removed and BEFORE 11662 // the bit is cleared, we will try to remove the mapping, too. 11663 // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works 11664 // in this scenario. 11665 // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect 11666 // peers are not supported when SMC is enabled. 
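//
// Timeline sketch of the benign race described above (T1 enables peer access
// while T2 runs this eviction path):
//
//     T1: set_bit() in va_space->indirect_peers
//                                          T2: reads the mask, sees the bit
//                                          T2: removes a reverse mapping that
//                                              may not have been added yet
//     T1: uvm_va_block_enable_peer()
//
// Both orderings are tolerated because the _on_eviction() variant does not
// require the mapping to be present in the reverse map.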
11667 for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 11668 uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id); 11669 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 11670 11671 uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings, 11672 peer_addr); 11673 } 11674 11675 uvm_mmu_chunk_unmap(chunk, tracker); 11676 11677 uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]); 11678 gpu_state->chunks[i] = NULL; 11679 } 11680 11681 out: 11682 uvm_va_block_context_free(block_context); 11683 if (mm) 11684 uvm_va_space_mm_release(va_space); 11685 11686 return status; 11687 } 11688 11689 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) 11690 { 11691 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); 11692 uvm_push_t push; 11693 NV_STATUS status; 11694 11695 // See comment in uvm_va_block_set_cancel 11696 UVM_ASSERT(!gpu->parent->fault_cancel_va_supported); 11697 11698 if (!gpu_state) 11699 return NV_ERR_NO_MEMORY; 11700 11701 // Force all pages to be 4K and prevent future upgrades during cancel 11702 gpu_state->force_4k_ptes = true; 11703 11704 // If we have no page tables we're done. For fault cancel we need to make 11705 // sure that fatal faults are on different 4k PTEs than non-fatal faults, 11706 // and we need to service all non-fatal faults before issuing the cancel. So 11707 // either all faults are fatal and we have no PTEs (we're PROT_NONE), or 11708 // we'll allocate PTEs later when we service the non-fatal faults. Those 11709 // PTEs will be 4k since force_4k_ptes is set. 11710 if (!block_gpu_has_page_tables(block, gpu)) 11711 return NV_OK; 11712 11713 // Are we 4k already? 11714 if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 11715 return NV_OK; 11716 11717 status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL); 11718 if (status != NV_OK) 11719 return status; 11720 11721 status = uvm_push_begin_acquire(gpu->channel_manager, 11722 UVM_CHANNEL_TYPE_MEMOPS, 11723 &block->tracker, 11724 &push, 11725 "Forcing 4k PTEs on block [0x%llx, 0x%llx)", 11726 block->start, 11727 block->end + 1); 11728 if (status != NV_OK) 11729 return status; 11730 11731 if (gpu_state->pte_is_2m) 11732 block_gpu_split_2m(block, block_context, gpu, NULL, &push); 11733 else 11734 block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push); 11735 11736 uvm_push_end(&push); 11737 11738 UVM_ASSERT(block_check_mappings(block)); 11739 11740 return uvm_tracker_add_push_safe(&block->tracker, &push); 11741 } 11742 11743 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) 11744 { 11745 uvm_assert_mutex_locked(&va_block->lock); 11746 11747 // Volta+ devices support a global VA cancel method that does not require 11748 // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it 11749 // could result in 4k PTEs on P9 systems which otherwise would never need 11750 // them. 
    if (gpu->parent->fault_cancel_va_supported)
        return NV_OK;

    return block_gpu_force_4k_ptes(va_block, block_context, gpu);
}

NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    struct mm_struct *mm;
    uvm_va_block_t *va_block;
    uvm_va_block_test_t *va_block_test;
    uvm_va_block_context_t *block_context = NULL;
    NV_STATUS status = NV_OK;

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    block_context = uvm_va_block_context_alloc(mm);
    if (!block_context) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    status = uvm_va_block_find_create(va_space, params->lookup_address, block_context, &va_block);
    if (status != NV_OK)
        goto out;

    va_block_test = uvm_va_block_get_test(va_block);
    UVM_ASSERT(va_block_test);

    uvm_mutex_lock(&va_block->lock);

    if (params->page_table_allocation_retry_force_count)
        va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;

    if (params->user_pages_allocation_retry_force_count)
        va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;

    if (params->cpu_chunk_allocation_size_mask) {
        if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
            !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto block_unlock;
        }

        va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
    }

    if (params->eviction_error)
        va_block_test->inject_eviction_error = params->eviction_error;

    if (params->cpu_pages_allocation_error_count)
        va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;

    if (params->populate_error)
        va_block_test->inject_populate_error = params->populate_error;

block_unlock:
    uvm_mutex_unlock(&va_block->lock);

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    uvm_va_block_context_free(block_context);
    return status;
}

static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
{
    [UVM_TEST_PTE_MAPPING_INVALID]           = UVM_PROT_NONE,
    [UVM_TEST_PTE_MAPPING_READ_ONLY]         = UVM_PROT_READ_ONLY,
    [UVM_TEST_PTE_MAPPING_READ_WRITE]        = UVM_PROT_READ_WRITE,
    [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
};

static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
{
    [UVM_PROT_NONE]              = UVM_TEST_PTE_MAPPING_INVALID,
    [UVM_PROT_READ_ONLY]         = UVM_TEST_PTE_MAPPING_READ_ONLY,
    [UVM_PROT_READ_WRITE]        = UVM_TEST_PTE_MAPPING_READ_WRITE,
    [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
};

NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_block_t *block;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;
    uvm_prot_t curr_prot, new_prot;
    uvm_gpu_t *gpu = NULL;
    uvm_processor_id_t id;
    uvm_tracker_t local_tracker;
    uvm_va_block_region_t region;
    uvm_va_block_context_t *block_context = NULL;

    if (!PAGE_ALIGNED(params->va))
        return NV_ERR_INVALID_ADDRESS;

    if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
        return NV_ERR_INVALID_ARGUMENT;

    new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];

    // mmap_lock isn't needed for invalidating CPU mappings, but it will be
    // needed for inserting them.
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (uvm_uuid_is_cpu(&params->uuid)) {
        id = UVM_ID_CPU;
    }
    else {
        gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
        if (!gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto out;
        }

        // Check if the GPU can access the VA
        if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
            status = NV_ERR_OUT_OF_RANGE;
            goto out;
        }

        id = gpu->id;
    }

    block_context = uvm_va_block_context_alloc(mm);
    if (!block_context) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    status = uvm_va_block_find_create(va_space, params->va, block_context, &block);
    if (status != NV_OK)
        goto out;

    // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
    if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
        goto out;

    uvm_mutex_lock(&block->lock);

    region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
    curr_prot = block_page_prot(block, id, region.first);

    if (new_prot == curr_prot) {
        status = NV_OK;
        goto out_block;
    }

    // TODO: Bug 1766124: Upgrades might require revoking other processors'
    //       access privileges. We just fail for now. Only downgrades are
    //       supported. If we allowed upgrades, we would need to check the mm
    //       like we do for revocation below.
    if (new_prot > curr_prot) {
        status = NV_ERR_INVALID_OPERATION;
        goto out_block;
    }

    block_context->policy = uvm_va_policy_get(block, params->va);

    if (new_prot == UVM_PROT_NONE) {
        status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
    }
    else {
        UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));

        // Revoking CPU mappings performs a combination of unmap + map. The map
        // portion requires a valid mm.
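        // If the vma check against the supplied mm fails, that map portion
        // cannot be performed, so fail the request with NV_ERR_INVALID_STATE
        // instead of attempting the revoke.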
        if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
            status = NV_ERR_INVALID_STATE;
        }
        else {
            status = uvm_va_block_revoke_prot(block,
                                              block_context,
                                              id,
                                              region,
                                              NULL,
                                              new_prot + 1,
                                              &block->tracker);
        }
    }

out_block:
    if (status == NV_OK)
        status = uvm_tracker_init_from(&local_tracker, &block->tracker);

    uvm_mutex_unlock(&block->lock);

    if (status == NV_OK)
        status = uvm_tracker_wait_deinit(&local_tracker);

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);

    uvm_va_block_context_free(block_context);

    return status;
}

NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_block_t *va_block;
    uvm_va_range_t *va_range;
    struct mm_struct *mm;
    size_t index;
    NV_STATUS status = NV_OK;

    BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, params->lookup_address);
    if (!va_range) {
        status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
        if (status == NV_ERR_OBJECT_NOT_FOUND) {
            status = uvm_hmm_va_block_range_bounds(va_space,
                                                   mm,
                                                   params->lookup_address,
                                                   &params->va_block_start,
                                                   &params->va_block_end,
                                                   NULL);
            goto out;
        }
        else if (status != NV_OK) {
            goto out;
        }
    }
    else {
        index = uvm_va_range_block_index(va_range, params->lookup_address);
        va_block = uvm_va_range_block(va_range, index);
        if (!va_block) {
            status = NV_ERR_OBJECT_NOT_FOUND;
            goto out;
        }
    }

    params->va_block_start = va_block->start;
    params->va_block_end = va_block->end;

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;
    uvm_va_block_t *block = NULL;
    struct mm_struct *mm;
    NvU32 count = 0;
    uvm_processor_mask_t resident_on_mask;
    uvm_processor_id_t id;
    uvm_page_index_t page_index;
    unsigned release_block_count = 0;
    NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
    size_t index;

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    // Inline uvm_va_block_find() to get the va_range.
    va_range = uvm_va_range_find(va_space, addr);
    if (!va_range) {
        NvU64 start, end;

        status = uvm_hmm_va_block_find(va_space, addr, &block);
        if (status != NV_OK) {
            if (status != NV_ERR_OBJECT_NOT_FOUND)
                goto out;
            status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
            goto out;
        }
        // Update current CPU mapping information.
        status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
        if (status != NV_OK) {
            block = NULL;
            goto out;
        }
    }
    else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }
    else {
        index = uvm_va_range_block_index(va_range, addr);
        block = uvm_va_range_block(va_range, index);
        if (!block) {
            params->resident_on_count = 0;
            params->populated_on_count = 0;
            params->mapped_on_count = 0;

            status = NV_OK;

            goto out;
        }
    }

    uvm_mutex_lock(&block->lock);

    page_index = uvm_va_block_cpu_page_index(block, addr);
    uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);

    for_each_id_in_mask(id, &resident_on_mask) {
        block_phys_page_t block_page = block_phys_page(id, page_index);
        uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
        params->resident_physical_size[count] = block_phys_page_size(block, block_page);
        if (UVM_ID_IS_CPU(id)) {
            params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
        }
        else {
            params->resident_physical_address[count] =
                block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
        }
        ++count;
    }
    params->resident_on_count = count;

    count = 0;
    for_each_id_in_mask(id, &block->mapped) {
        uvm_processor_id_t processor_to_map;
        block_phys_page_t block_page;
        NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);

        if (page_size == 0)
            continue;

        uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);

        params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
        UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
        processor_to_map = block_get_processor_to_map(block, id, page_index);
        block_page = block_phys_page(processor_to_map, page_index);

        if (!UVM_ID_IS_CPU(id)) {
            uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
                                                                           block_page,
                                                                           uvm_va_space_get_gpu(va_space, id));
            params->mapping_physical_address[count] = gpu_phys_addr.address;
        }
        else {
            struct page *page = block_page_get(block, block_page);

            params->mapping_physical_address[count] = page_to_phys(page);
        }

        params->page_size[count] = page_size;
        ++count;
    }

    if (params->resident_on_count == 1) {
        if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) {
            if (uvm_pmm_sysmem_mappings_indirect_supported()) {
                for_each_gpu_id(id) {
                    NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
                    uvm_reverse_map_t sysmem_page;
                    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
                    size_t num_pages;
                    uvm_gpu_t *gpu;

                    if (!uvm_va_block_gpu_state_get(block, id))
                        continue;

                    gpu = uvm_va_space_get_gpu(va_space, id);

                    if (!gpu->parent->access_counters_supported)
                        continue;

                    num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
                                                                    uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
                                                                    uvm_cpu_chunk_get_size(chunk),
                                                                    &sysmem_page,
                                                                    1);
                    if (page_size > 0)
                        UVM_ASSERT(num_pages == 1);
                    else
                        UVM_ASSERT(num_pages <= 1);

                    if (num_pages == 1) {
                        UVM_ASSERT(sysmem_page.va_block == block);
                        UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
                        UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);

                        ++release_block_count;
                    }
                }
            }
        }
        else {
            uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask);
            uvm_reverse_map_t gpu_mapping;
            size_t num_pages;
            uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
            uvm_gpu_phys_address_t phys_addr;

            phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
            num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);

            // Chunk may be in TEMP_PINNED state so it may not have a VA block
            // assigned. In that case, we don't get a valid translation.
            if (num_pages > 0) {
                UVM_ASSERT(num_pages == 1);
                UVM_ASSERT(gpu_mapping.va_block == block);
                UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);

                ++release_block_count;
            }
        }
    }

    params->mapped_on_count = count;

    count = 0;
    for_each_processor_id(id) {
        if (!block_processor_page_is_populated(block, id, page_index))
            continue;

        uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
        ++count;
    }
    params->populated_on_count = count;

out:
    if (block) {
        if (!params->is_async && status == NV_OK)
            status = uvm_tracker_wait(&block->tracker);
        uvm_mutex_unlock(&block->lock);
        while (release_block_count--)
            uvm_va_block_release(block);
    }
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
{
    block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
}
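
// Illustrative usage sketch (not part of the driver): a caller that has just
// modified a block's CPU pages could mark the whole block dirty so that the
// CPU dirty state covers every page in the block's region. The locking shown
// follows the uvm_mutex_lock(&block->lock) pattern used by the test entry
// points above; treating it as a requirement of uvm_va_block_mark_cpu_dirty()
// is an assumption here, not something this file documents.
//
//     uvm_mutex_lock(&va_block->lock);
//     uvm_va_block_mark_cpu_dirty(va_block);
//     uvm_mutex_unlock(&va_block->lock);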