/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_linux.h"
#include "uvm_common.h"
#include "uvm_api.h"
#include "uvm_gpu.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_va_block.h"
#include "uvm_hal_types.h"
#include "uvm_kvmalloc.h"
#include "uvm_tools.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_perf_thrashing.h"
#include "uvm_perf_prefetch.h"
#include "uvm_mem.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_va_space_mm.h"
#include "uvm_test_ioctl.h"
#include "uvm_conf_computing.h"

typedef enum
{
    BLOCK_PTE_OP_MAP,
    BLOCK_PTE_OP_REVOKE,
    BLOCK_PTE_OP_COUNT
} block_pte_op_t;

static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;

static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;

static int uvm_fault_force_sysmem __read_mostly = 0;
module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");

static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);

// Caching is always disabled for mappings to remote memory. The following two
// module parameters can be used to force caching for GPU peer/sysmem mappings.
//
// However, it is important to note that it may not be safe to enable caching
// in the general case so the enablement should only be used for experiments.
static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
                 "Force caching for mappings to peer memory. "
                 "This is an experimental parameter that may cause correctness issues if used.");

static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
                 "Force caching for mappings to system memory. "
                 "This is an experimental parameter that may cause correctness issues if used.");
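
// Illustrative usage notes (assumptions, not part of the original sources; the
// module and parameter paths below may differ on a given installation):
// parameters registered with S_IRUGO|S_IWUSR, such as uvm_fault_force_sysmem
// above, can typically be toggled at runtime by root, e.g.:
//
//     echo 1 > /sys/module/nvidia_uvm/parameters/uvm_fault_force_sysmem
//
// while read-only (S_IRUGO) parameters such as uvm_perf_map_remote_on_eviction
// need to be set at module load time, e.g.:
//
//     modprobe nvidia-uvm uvm_perf_map_remote_on_eviction=0
//
// Note that uvm_fault_force_sysmem only takes effect when
// uvm_enable_builtin_tests is set (see is_uvm_fault_force_sysmem_set() below).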
" 80 "This is an experimental parameter that may cause correctness issues if used."); 81 82 static void block_add_eviction_mappings_entry(void *args); 83 84 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block) 85 { 86 #if UVM_IS_CONFIG_HMM() 87 if (va_block->hmm.va_space) 88 return va_block->hmm.va_space; 89 #endif 90 91 if (va_block->va_range) 92 return va_block->va_range->va_space; 93 94 return NULL; 95 } 96 97 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block) 98 { 99 uvm_va_space_t *va_space; 100 101 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 102 103 va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 104 UVM_ASSERT(va_space); 105 106 return va_space; 107 } 108 109 bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block, 110 const uvm_va_policy_t *policy, 111 uvm_va_block_region_t region) 112 { 113 uvm_assert_mutex_locked(&va_block->lock); 114 115 if (uvm_va_block_is_hmm(va_block)) { 116 const uvm_va_policy_node_t *node; 117 118 if (uvm_va_policy_is_default(policy)) { 119 // There should only be the default policy within the region. 120 node = uvm_va_policy_node_iter_first(va_block, 121 uvm_va_block_region_start(va_block, region), 122 uvm_va_block_region_end(va_block, region)); 123 UVM_ASSERT(!node); 124 } 125 else { 126 // The policy node should cover the region. 127 node = uvm_va_policy_node_from_policy(policy); 128 UVM_ASSERT(node->node.start <= uvm_va_block_region_start(va_block, region)); 129 UVM_ASSERT(node->node.end >= uvm_va_block_region_end(va_block, region)); 130 } 131 } 132 else { 133 UVM_ASSERT(policy == uvm_va_range_get_policy(va_block->va_range)); 134 } 135 136 return true; 137 } 138 139 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) 140 { 141 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 142 143 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 144 145 // Local vidmem is always cached 146 if (uvm_id_equal(resident_id, gpu->id)) 147 return UVM_MMU_PTE_FLAGS_CACHED; 148 149 if (UVM_ID_IS_CPU(resident_id)) 150 return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; 151 152 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id)); 153 154 return uvm_exp_gpu_cache_peermem == 0 ? 
UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; 155 } 156 157 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) 158 { 159 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 160 161 return uvm_va_space_get_gpu(va_space, gpu_id); 162 } 163 164 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id) 165 { 166 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 167 168 return uvm_va_space_processor_name(va_space, id); 169 } 170 171 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id) 172 { 173 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 174 175 return uvm_va_space_processor_has_memory(va_space, id); 176 } 177 178 static bool is_uvm_fault_force_sysmem_set(void) 179 { 180 // Only enforce this during testing 181 return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0; 182 } 183 184 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space) 185 { 186 return uvm_perf_map_remote_on_eviction && 187 uvm_va_space_has_access_counter_migrations(va_space); 188 } 189 190 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block) 191 { 192 // Note that for HMM we always return a pointer to a zero bitmap 193 // (not allocated on the stack) since uvm_lite GPUs are not supported. 194 static const uvm_processor_mask_t uvm_lite_gpus = {}; 195 196 if (uvm_va_block_is_hmm(va_block)) 197 return &uvm_lite_gpus; 198 else 199 return &va_block->va_range->uvm_lite_gpus; 200 } 201 202 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry) 203 { 204 if (!retry) 205 return; 206 207 uvm_tracker_init(&retry->tracker); 208 INIT_LIST_HEAD(&retry->used_chunks); 209 INIT_LIST_HEAD(&retry->free_chunks); 210 } 211 212 // The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks 213 // are stored. 214 // 215 // CPU chunk storage is handled in three different ways depending on the 216 // type of chunks the VA block owns. This is done to minimize the memory 217 // required to hold metadata. 218 typedef enum 219 { 220 // The uvm_va_block_t::chunk pointer points to a single 2MB 221 // CPU chunk. 222 UVM_CPU_CHUNK_STORAGE_CHUNK = 0, 223 224 // The uvm_va_block_t::chunks pointer points to a 225 // structure of mixed (64K and 4K) chunks. 226 UVM_CPU_CHUNK_STORAGE_MIXED, 227 UVM_CPU_CHUNK_STORAGE_COUNT, 228 } uvm_cpu_chunk_storage_type_t; 229 230 #define UVM_CPU_CHUNK_STORAGE_MASK 0x1 231 232 // The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is 233 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will 234 // be stored in the first/last entry, respectively. 235 #define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK 236 237 #define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE) 238 239 // This structure is used when a VA block contains 64K or a mix of 64K and 4K 240 // CPU chunks. 241 // For every 64K CPU chunks, big_chunks will have its corresponding bit set 242 // and the corresponding index in slots will point directly to the 243 // uvm_cpu_chunk_t structure. 244 // 245 // For 4K CPU chunks, the corresponding bit in big_chunks will be clear and 246 // the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers. 
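
// Illustrative example of the mixed storage layout above (assuming 4K base
// pages, so MAX_SMALL_CHUNKS_PER_BIG_SLOT is 16 and a 2MB-aligned block has 32
// slots; numbers are not from the original comments): if pages 0..15 of such a
// block are backed by a single 64K chunk, bit 0 of big_chunks is set and
// slots[0] points directly at that uvm_cpu_chunk_t. If page 16 is instead
// backed by a 4K chunk, bit 1 stays clear and slots[1] points to an array of
// 16 uvm_cpu_chunk_t pointers, with the chunk stored at entry 0.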

static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
                                                        uvm_cpu_chunk_t *chunk,
                                                        uvm_page_index_t page_index)
{
    UVM_ASSERT(chunk);
    return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
}

static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block)
{
    return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
}

static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block)
{
    return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
}

static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
{
    return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
}

static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
    uvm_page_index_t prefix;
    size_t slot_index;

    UVM_ASSERT(page_index < block_region.outer);
    prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);

    if (page_index < prefix)
        return 0;

    slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
    UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);

    return slot_index;
}

static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);

    if (page_index < prefix)
        return page_index;

    return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
}
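
// Worked example for compute_slot_index()/compute_small_index() (assuming 4K
// base pages, so MAX_SMALL_CHUNKS_PER_BIG_SLOT is 16; hypothetical numbers not
// from the original comments): for a block whose start address is 3 pages
// below a 64K boundary, compute_page_prefix(va_block, UVM_PAGE_SIZE_64K)
// returns 3.
//
//   - Pages 0..2 (the misaligned prefix) land in slot 0 and use their own
//     page index as the small index.
//   - Page 3, the first 64K-aligned page, maps to slot (3 - 3) / 16 + 1 = 1
//     with small index (3 - 3) % 16 = 0.
//   - Page 20 maps to slot (20 - 3) / 16 + 1 = 2 with small index
//     (20 - 3) % 16 = 1.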

NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
                                        uvm_cpu_chunk_t *chunk,
                                        uvm_page_index_t page_index)
{
    uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
    uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
    size_t slot_index;
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t **chunks = NULL;

    // We only want to use the bottom bit of a pointer.
    BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);

    // We want to protect against two threads manipulating the VA block's CPU
    // chunks at the same time. However, when a block is split, the new block's
    // lock is locked without tracking. So, we can't use
    // uvm_assert_mutex_locked().
    UVM_ASSERT(mutex_is_locked(&va_block->lock.m));

    if (chunk_size == UVM_CHUNK_SIZE_2M) {
        UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
        UVM_ASSERT(!va_block->cpu.chunks);
        va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
    }
    else {
        if (!va_block->cpu.chunks) {
            mixed = uvm_kvmalloc_zero(sizeof(*mixed));
            if (!mixed)
                return NV_ERR_NO_MEMORY;

            va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
        }

        UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED);
        mixed = uvm_cpu_storage_get_ptr(va_block);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
        UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));

        if (chunk_size == UVM_CHUNK_SIZE_64K) {
            mixed->slots[slot_index] = chunk;
            set_bit(slot_index, mixed->big_chunks);
        }
        else {
            size_t small_index;

            UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
            chunks = mixed->slots[slot_index];

            if (!chunks) {
                chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
                if (!chunks)
                    return NV_ERR_NO_MEMORY;
                mixed->slots[slot_index] = chunks;
            }

            small_index = compute_small_index(va_block, page_index);
            chunks[small_index] = chunk;
        }
    }

    uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
    return NV_OK;
}

uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t *chunk;
    uvm_cpu_chunk_t **chunks;
    size_t slot_index;

    UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
    if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index))
        return NULL;

    UVM_ASSERT(va_block->cpu.chunks);

    if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
        return uvm_cpu_storage_get_ptr(va_block);
    }
    else {
        mixed = uvm_cpu_storage_get_ptr(va_block);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(mixed->slots[slot_index] != NULL);
        if (test_bit(slot_index, mixed->big_chunks))
            return mixed->slots[slot_index];

        chunks = mixed->slots[slot_index];
        chunk = chunks[compute_small_index(va_block, page_index)];
    }

    UVM_ASSERT(chunk);
    return chunk;
}

void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
                                     uvm_page_index_t page_index)
{
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
    uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
    size_t slot_index;
    uvm_cpu_chunk_t **chunks;

    // We want to protect against two threads manipulating the VA block's CPU
    // chunks at the same time. However, when a block is split, the new block's
    // lock is locked without tracking. So, we can't use
    // uvm_assert_mutex_locked().
    UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
    UVM_ASSERT(va_block->cpu.chunks);
    UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));

    if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
        UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
        UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk);
        va_block->cpu.chunks = 0;
    }
    else {
        UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
        mixed = uvm_cpu_storage_get_ptr(va_block);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(mixed->slots[slot_index] != NULL);

        if (test_bit(slot_index, mixed->big_chunks)) {
            UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
            UVM_ASSERT(mixed->slots[slot_index] == chunk);
            mixed->slots[slot_index] = NULL;
            clear_bit(slot_index, mixed->big_chunks);
        }
        else {
            size_t small_index;

            UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
            chunks = mixed->slots[slot_index];
            small_index = compute_small_index(va_block, page_index);
            UVM_ASSERT(chunks[small_index] == chunk);
            chunks[small_index] = NULL;

            for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
                if (chunks[small_index])
                    break;
            }

            if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
                uvm_kvfree(chunks);
                mixed->slots[slot_index] = NULL;
            }
        }
    }

    uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region);

    if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) {
        uvm_kvfree(uvm_cpu_storage_get_ptr(va_block));
        va_block->cpu.chunks = 0;
    }
}

struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_va_block_region_t chunk_region;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);

    UVM_ASSERT(chunk);
    UVM_ASSERT(chunk->page);
    chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
    return chunk->page + (page_index - chunk_region.first);
}

static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
                                                      uvm_va_block_region_t region,
                                                      uvm_page_index_t *first_chunk_page)
{
    uvm_cpu_chunk_t *chunk = NULL;
    uvm_page_index_t page_index;

    page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated);
    if (page_index < region.outer)
        chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);

    if (first_chunk_page && chunk) {
        uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
        *first_chunk_page = chunk_region.first;
    }

    return chunk;
}

#define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region)                                       \
    for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index));                                \
         (chunk) != NULL;                                                                                             \
         (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
                                                 uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \
                                                                     (region).outer),                                 \
                                                 &(page_index)))

#define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region)                 \
    for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)),                                \
         (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);                             \
         (chunk) != NULL;                                                                                             \
         (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
                                                 uvm_va_block_region((next_page_index), (region).outer),              \
                                                 &(page_index)),                                                      \
         (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))

#define for_each_cpu_chunk_in_block(chunk, page_index, va_block)                                                      \
    for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block)))

#define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block)                                \
    for_each_cpu_chunk_in_block_region_safe((chunk),                                                                  \
                                            (page_index),                                                             \
                                            (next_page_index),                                                        \
                                            (va_block),                                                               \
                                            uvm_va_block_region_from_block((va_block)))
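
// The _safe variants above compute next_page_index before the loop body runs,
// so the body may remove (and free) the current chunk without breaking the
// iteration, analogous to list_for_each_entry_safe(). Illustrative usage
// (hypothetical snippet, not from the original sources; mirrors
// uvm_va_block_remove_cpu_chunks() below):
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index, next_page_index;
//
//     for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block) {
//         uvm_cpu_chunk_remove_from_block(va_block, page_index);
//         uvm_cpu_chunk_free(chunk);
//     }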

struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
                                                    struct mm_struct *mm,
                                                    NvU64 start,
                                                    uvm_va_block_region_t *region)
{
    struct vm_area_struct *vma;
    NvU64 end;

    if (start > va_block->end)
        return NULL;

    vma = find_vma_intersection(mm, start, va_block->end + 1);
    if (!vma)
        return NULL;

    if (start < vma->vm_start)
        start = vma->vm_start;

    end = vma->vm_end - 1;
    if (end > va_block->end)
        end = va_block->end;

    *region = uvm_va_block_region_from_start_end(va_block, start, end);

    return vma;
}

static bool block_check_cpu_chunks(uvm_va_block_t *block)
{
    uvm_cpu_chunk_t *chunk;
    size_t alloced_pages = 0;
    uvm_va_block_region_t prev_region = { 0 };
    uvm_page_index_t page_index;

    for_each_cpu_chunk_in_block(chunk, page_index, block) {
        uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
        size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
        uvm_page_index_t chunk_page;

        UVM_ASSERT(prev_region.outer <= chunk_region.first);
        UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
        UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));

        alloced_pages += uvm_cpu_chunk_num_pages(chunk);
        UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region));
        prev_region = chunk_region;

        for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
            UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk);
    }

    UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated));

    return true;
}

// Frees any left-over free chunks and unpins all the used chunks
void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
{
    uvm_gpu_t *gpu;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_chunk_t *next_chunk;

    if (!retry)
        return;

    uvm_tracker_deinit(&retry->tracker);

    // Free any unused chunks
    list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
        list_del_init(&gpu_chunk->list);
        gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
    }

    // Unpin all the used chunks now that we are done
    list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
        list_del_init(&gpu_chunk->list);
        gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        // HMM should have already moved allocated blocks to the referenced
        // state so any left over were not migrated and should be freed.
        if (uvm_va_block_is_hmm(va_block))
            uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
        else
            uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
    }
}

static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
{
    list_add_tail(&gpu_chunk->list, &retry->free_chunks);
}

static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
{
    list_add_tail(&gpu_chunk->list, &retry->used_chunks);
}

static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
{
    uvm_gpu_chunk_t *gpu_chunk;

    list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
        if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
            list_del_init(&gpu_chunk->list);
            return gpu_chunk;
        }
    }

    return NULL;
}

// Encapsulates a reference to a physical page belonging to a specific
// processor within a VA block.
typedef struct
{
    // Processor the page is on
    uvm_processor_id_t processor;

    // The page index
    uvm_page_index_t page_index;
} block_phys_page_t;

static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index)
{
    return (block_phys_page_t){ processor, page_index };
}

NV_STATUS uvm_va_block_init(void)
{
    if (uvm_enable_builtin_tests)
        g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
    else
        g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);

    if (!g_uvm_va_block_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
    if (!g_uvm_va_block_gpu_state_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
    if (!g_uvm_page_mask_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
    if (!g_uvm_va_block_context_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_va_block_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
    kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_cache);
}

uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
{
    uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
    if (block_context)
        uvm_va_block_context_init(block_context, mm);

    return block_context;
}

void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
{
    if (va_block_context)
        kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
}

// Convert from page_index to chunk_index. The goal is for each system page in
// the region [start, start + size) to be covered by the largest naturally-
// aligned user chunk size.
size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
                                          NvU64 size,
                                          uvm_gpu_t *gpu,
                                          uvm_page_index_t page_index,
                                          uvm_chunk_size_t *out_chunk_size)
{
    uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
    uvm_chunk_size_t chunk_size, final_chunk_size;
    size_t num_chunks, num_chunks_total;
    NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(size));
    UVM_ASSERT(size > 0);
    UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
    UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
    BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);

    // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
    // mask, since we never deal with chunk sizes smaller than that (although
    // we may have PTEs mapping pages smaller than that).
    UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);

    // Optimize the ideal Pascal+ case: the whole block is covered by a single
    // 2M page.
    if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
        UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
        final_chunk_size = UVM_CHUNK_SIZE_2M;
        num_chunks_total = 0;
        goto out;
    }

    // Only one 2M chunk can fit within a VA block on any GPU architecture, so
    // remove that size from consideration.
    chunk_sizes &= ~UVM_CHUNK_SIZE_2M;

    // Next common case: the whole block is aligned and sized to perfectly fit
    // the largest page size.
    final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
    if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
        num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
        goto out;
    }

    // We didn't hit our special paths. Do it the hard way.

    num_chunks_total = 0;
    addr = start + page_index * PAGE_SIZE;
    end = start + size;
    final_chunk_size = 0;
    UVM_ASSERT(addr < end);

    // The below loop collapses almost completely when chunk_size == PAGE_SIZE
    // since in that lowest-common-denominator case everything is already
    // aligned. Skip it and handle that specially after the loop.
    //
    // Note that since we removed 2M already above, this loop will only iterate
    // once on x86 Pascal+ since only 64K is left.
    chunk_sizes &= ~PAGE_SIZE;

    // This loop calculates the number of chunks between start and addr by
    // calculating the number of whole chunks of each size between them,
    // starting with the largest allowed chunk size. This requires fewer
    // iterations than if we began from start and kept calculating the next
    // larger chunk size boundary.
    for_each_chunk_size_rev(chunk_size, chunk_sizes) {
        aligned_start = UVM_ALIGN_UP(start, chunk_size);
        aligned_addr = UVM_ALIGN_DOWN(addr, chunk_size);
        aligned_end = UVM_ALIGN_DOWN(end, chunk_size);

        // If addr and start are within the same chunk, try smaller
        if (aligned_start > aligned_addr)
            continue;

        // If addr and end are not in the same chunk, then addr is covered by a
        // single chunk of the current size. Ignore smaller boundaries between
        // addr and aligned_addr.
        if (aligned_addr < aligned_end && final_chunk_size == 0) {
            addr = aligned_addr;
            final_chunk_size = chunk_size;
        }

        // How many chunks of this size are between start and addr? Note that
        // this might be 0 since aligned_addr and aligned_start could be in the
        // same chunk.
        num_chunks = uvm_div_pow2_32(((NvU32)aligned_addr - aligned_start), chunk_size);
        num_chunks_total += num_chunks;

        // We've already accounted for these chunks, so "remove" them by
        // bringing start, addr, and end closer together to calculate the
        // remaining chunk sizes.
        temp_size = num_chunks * chunk_size;
        addr -= temp_size;
        end -= temp_size;

        // Once there's no separation between addr and start, and we've
        // successfully found the right chunk size when taking end into
        // account, we're done.
        if (addr == start && final_chunk_size)
            break;
    }

    // Handle PAGE_SIZE cleanup since we skipped it in the loop
    num_chunks_total += (addr - start) / PAGE_SIZE;
    if (final_chunk_size == 0)
        final_chunk_size = PAGE_SIZE;

out:
    if (out_chunk_size)
        *out_chunk_size = final_chunk_size;

    return num_chunks_total;
}
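
// Illustrative walk-through of uvm_va_block_gpu_chunk_index_range() above
// (hypothetical values, assuming 4K system pages and GPU user chunk sizes of
// 4K/64K/2M; not from the original comments): for start = 2M_aligned_base +
// 32K and size = 192K, the region is laid out as eight 4K chunks (indices
// 0..7) covering the misaligned 32K head, two 64K chunks (indices 8 and 9),
// and eight more 4K chunks (indices 10..17) covering the 32K tail. A query for
// page_index 20 (offset 80K from start) therefore returns chunk index 8 with
// *out_chunk_size == UVM_CHUNK_SIZE_64K.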

static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
                                          NvU64 start,
                                          NvU64 size,
                                          uvm_gpu_t *gpu,
                                          uvm_page_index_t page_index,
                                          uvm_chunk_size_t *out_chunk_size)
{
    if (uvm_va_block_is_hmm(va_block)) {
        if (out_chunk_size)
            *out_chunk_size = PAGE_SIZE;
        return page_index;
    }

    return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
}

static size_t block_gpu_chunk_index(uvm_va_block_t *block,
                                    uvm_gpu_t *gpu,
                                    uvm_page_index_t page_index,
                                    uvm_chunk_size_t *out_chunk_size)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
    uvm_chunk_size_t size;
    uvm_gpu_chunk_t *chunk;
    size_t index;

    index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);

    UVM_ASSERT(size >= PAGE_SIZE);

    if (gpu_state) {
        UVM_ASSERT(gpu_state->chunks);
        chunk = gpu_state->chunks[index];
        if (chunk) {
            UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
            UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
            UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
        }
    }

    if (out_chunk_size)
        *out_chunk_size = size;

    return index;
}

// Compute the size of the chunk known to start at start_page_index
static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
{
    uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
    uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
    NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
    NvU64 size = block->end - start + 1;

    if (uvm_va_block_is_hmm(block))
        return PAGE_SIZE;

    // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields
    // a mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
    // Example: 1011000 -> 0001111
    start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));

    // Next, compute all sizes (powers of two) which are <= size.
    pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
    pow2_leq_size |= pow2_leq_size - 1;

    // Now AND them all together to get our list of GPU-supported chunk sizes
    // which are aligned to start and will fit within size.
    allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;

    // start and size must always be aligned to at least the smallest supported
    // chunk size (PAGE_SIZE).
    UVM_ASSERT(allowed_sizes >= PAGE_SIZE);

    // Take the largest allowed size
    return uvm_chunk_find_last_size(allowed_sizes);
}
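
// Worked example for block_gpu_chunk_size() above (hypothetical numbers,
// assuming user chunk sizes of 4K/64K/2M; not from the original comments): if
// start is 64K-aligned but not 128K-aligned, start ^ (start - 1) sets every
// bit up to and including the 64K bit, so start_alignments allows sizes up to
// 64K. If size is 120K, rounddown_pow_of_two(size) is 64K and pow2_leq_size
// becomes a mask of all sizes <= 64K. ANDing both with the supported chunk
// sizes leaves {4K, 64K}, and uvm_chunk_find_last_size() picks 64K.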

static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
}

static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
{
    uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
    return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
}

uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
{
    size_t chunk_index;
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
    uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);

    uvm_assert_mutex_locked(&va_block->lock);

    if (!gpu_state)
        return NULL;

    chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);

    return gpu_state->chunks[chunk_index];
}

NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
                              NvU64 start,
                              NvU64 end,
                              uvm_va_block_t **out_block)
{
    uvm_va_block_t *block = NULL;
    NvU64 size = end - start + 1;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(end + 1));
    UVM_ASSERT(PAGE_ALIGNED(size));
    UVM_ASSERT(size > 0);
    UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);

    if (va_range) {
        // Create a managed va_block.
        UVM_ASSERT(start >= va_range->node.start);
        UVM_ASSERT(end <= va_range->node.end);
        UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    }

    // Blocks can't span a block alignment boundary
    UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));

    if (uvm_enable_builtin_tests) {
        uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);

        if (block_wrapper)
            block = &block_wrapper->block;
    }
    else {
        block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
    }

    if (!block)
        return NV_ERR_NO_MEMORY;

    nv_kref_init(&block->kref);
    uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
    block->start = start;
    block->end = end;
    block->va_range = va_range;
    uvm_tracker_init(&block->tracker);
    block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;

    nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);

    *out_block = block;
    return NV_OK;
}

static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
    NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
    if (gpu_mapping_addr == 0)
        return;

    uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
    uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
}

static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
                                                  uvm_va_block_t *block,
                                                  uvm_page_index_t page_index,
                                                  uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_chunk_size_t chunk_size;

    // When the Confidential Computing feature is enabled, the transfers don't
    // use the DMA mapping of CPU chunks (since it's protected memory), but
    // the DMA address of the unprotected dma buffer.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = uvm_cpu_chunk_map_gpu(chunk, gpu);
    if (status != NV_OK)
        return status;

    chunk_size = uvm_cpu_chunk_get_size(chunk);

    // TODO: Bug 3744779: Handle benign assertion in
    //       pmm_sysmem_mappings_remove_gpu_mapping() in case of a
    //       failure.
    status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
                                                     uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
                                                     uvm_va_block_cpu_page_address(block, page_index),
                                                     chunk_size,
                                                     block,
                                                     UVM_ID_CPU);
    if (status != NV_OK)
        cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);

    return status;
}

static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    uvm_cpu_chunk_t *chunk;
    uvm_page_index_t page_index;

    for_each_cpu_chunk_in_block(chunk, page_index, block)
        cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
}

static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_cpu_chunk_t *chunk;
    NvU64 block_mapping_size = uvm_va_block_size(block);
    uvm_page_index_t page_index;

    UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));

    for_each_cpu_chunk_in_block(chunk, page_index, block) {
        UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
                       "GPU%u DMA address 0x%llx\n",
                       uvm_id_value(gpu->id),
                       uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));

        status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    block_gpu_unmap_phys_all_cpu_pages(block, gpu);
    return status;
}

static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
                                                     uvm_gpu_t *local_gpu,
                                                     uvm_gpu_chunk_t *chunk,
                                                     uvm_gpu_t *accessing_gpu)
{
    NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
    return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
                                                         peer_addr,
                                                         block->start + chunk->va_block_page_index * PAGE_SIZE,
                                                         uvm_gpu_chunk_get_size(chunk),
                                                         block,
                                                         local_gpu->id);
}

static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
                                                   uvm_gpu_chunk_t *chunk,
                                                   uvm_gpu_t *accessing_gpu)
{
    NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
    uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
}

static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
                                                        uvm_gpu_t *local_gpu,
                                                        uvm_gpu_t *accessing_gpu)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    size_t num_chunks, i;
    NV_STATUS status;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
                                       accessing_gpu->id));

    // If no chunks are allocated currently, the mappings will be created later
    // at chunk allocation.
    if (!gpu_state || !gpu_state->chunks)
        return NV_OK;

    num_chunks = block_num_gpu_chunks(block, local_gpu);
    for (i = 0; i < num_chunks; i++) {
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
        if (!chunk)
            continue;

        status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;

        status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    while (i-- > 0) {
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
        if (chunk) {
            // Indirect peer mappings are removed lazily by PMM, so if an error
            // occurs the mappings established above will be removed when the
            // chunk is freed later on. We only need to remove the sysmem
            // reverse mappings.
            block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
        }
    }

    return status;
}

// Mappings for indirect peers are removed lazily by PMM, but we need to remove
// the entries from the reverse map.
static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
                                                     uvm_gpu_t *local_gpu,
                                                     uvm_gpu_t *accessing_gpu)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    size_t num_chunks, i;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
                                       accessing_gpu->id));

    // Exit if no chunks are allocated currently.
    if (!gpu_state || !gpu_state->chunks)
        return;

    num_chunks = block_num_gpu_chunks(block, local_gpu);
    for (i = 0; i < num_chunks; i++) {
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
        if (chunk)
            block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
    }
}

// Retrieves the gpu_state for the given GPU. The returned pointer is
// internally managed and will be allocated (and freed) automatically,
// rather than by the caller.
static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);

    if (gpu_state)
        return gpu_state;

    gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
    if (!gpu_state)
        return NULL;

    gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
    if (!gpu_state->chunks)
        goto error;

    block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;

    status = block_gpu_map_phys_all_cpu_pages(block, gpu);
    if (status != NV_OK)
        goto error;

    return gpu_state;

error:
    uvm_kvfree(gpu_state->chunks);
    kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
    block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;

    return NULL;
}

NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));
    uvm_assert_mutex_locked(&va_block->lock);

    for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
        if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
            return NV_ERR_NO_MEMORY;
    }

    return NV_OK;
}

void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
                                          uvm_cpu_chunk_t *chunk,
                                          uvm_page_index_t page_index)
{
    uvm_gpu_id_t id;

    for_each_gpu_id(id) {
        if (uvm_va_block_gpu_state_get(block, id))
            cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
    }
}

NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
                                             uvm_page_index_t page_index)
{
    NV_STATUS status;
    uvm_gpu_id_t id;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);

    // We can't iterate over va_space->registered_gpus because we might be
    // on the eviction path, which does not have the VA space lock held. We
    // have the VA block lock held however, so the gpu_states can't change.
    uvm_assert_mutex_locked(&block->lock);

    for_each_gpu_id(id) {
        uvm_gpu_t *gpu;

        if (!uvm_va_block_gpu_state_get(block, id))
            continue;

        gpu = block_get_gpu(block, id);
        status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
    return status;
}

void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
{
    uvm_cpu_chunk_t *chunk;
    uvm_page_index_t page_index, next_page_index;
    uvm_va_block_region_t chunk_region;

    for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) {
        chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));

        uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
        uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
        uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region);
        uvm_cpu_chunk_remove_from_block(va_block, page_index);
        uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
        uvm_cpu_chunk_free(chunk);
    }

    if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
        uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
    if (uvm_page_mask_empty(&va_block->cpu.resident))
        uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
}

// Create physical mappings to allow other GPUs to access this chunk.
static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    uvm_gpu_t *accessing_gpu, *remove_gpu;
    NV_STATUS status;

    // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
    // the eviction path, so we can assume that the VA space is locked.
    //
    // TODO: Bug 2007346: In the future we may want to enable eviction to
    //       peers, meaning we may need to allocate peer memory and map it on
    //       the eviction path. That will require making sure that peers can't
    //       be enabled or disabled either in the VA space or globally within
    //       this function.
    uvm_assert_rwsem_locked(&va_space->lock);
    uvm_assert_mutex_locked(&block->lock);

    for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
        status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;

        status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
        if (status != NV_OK)
            goto error;
    }

    return NV_OK;

error:
    for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
        if (remove_gpu == accessing_gpu)
            break;

        // Indirect peer mappings are removed lazily by PMM, so if an error
        // occurs the mappings established above will be removed when the
        // chunk is freed later on. We only need to remove the sysmem
        // reverse mappings.
        block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
    }

    return status;
}

static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    uvm_gpu_t *peer_gpu;

    uvm_assert_rwsem_locked(&va_space->lock);
    uvm_assert_mutex_locked(&block->lock);

    // Indirect peer mappings are removed lazily by PMM, so we only need to
    // remove the sysmem reverse mappings.
    for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
        block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
}

// Mark a CPU page as dirty.
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
    uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
}

// Mark a CPU page as clean.
static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
    uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
}

// Check if a CPU page is dirty.
static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
    uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
    return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
}

static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
                                       uvm_chunk_size_t alloc_size,
                                       uvm_cpu_chunk_alloc_flags_t flags,
                                       uvm_cpu_chunk_t **chunk)
{
    uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);

    // Return out of memory error if the tests have requested it. As opposed to
    // other error injection settings, this one fails N times and then
    // succeeds.
    // TODO: Bug 3701182: This will print a warning in Linux kernels newer than
    //       5.16.0-rc1+.
    if (block_test && block_test->inject_cpu_pages_allocation_error_count) {
        if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
            block_test->inject_cpu_pages_allocation_error_count--;
        return NV_ERR_NO_MEMORY;
    }

    return uvm_cpu_chunk_alloc(alloc_size, flags, chunk);
}

// Allocates the input page in the block, if it doesn't already exist.
//
// Also maps the page for physical access by all GPUs used by the block, which
// is required for IOMMU support. Skipped on GPUs without access to CPU memory,
// e.g., when the Confidential Computing feature is enabled.
static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
                                          uvm_page_mask_t *populate_page_mask,
                                          uvm_va_block_region_t populate_region,
                                          uvm_va_block_context_t *block_context)
{
    NV_STATUS status = NV_OK;
    uvm_cpu_chunk_t *chunk;
    uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
    uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
    uvm_chunk_size_t alloc_size;
    uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
    uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
    uvm_processor_mask_t uvm_lite_gpus;
    uvm_page_index_t page_index;
    uvm_gpu_id_t id;

    // Check whether all requested pages have already been allocated.
    uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
    if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
                              &block_context->scratch_page_mask,
                              &block->cpu.allocated))
        return NV_OK;

    if (block_test) {
        if (block_test->cpu_chunk_allocation_size_mask)
            cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
    }

    uvm_page_mask_zero(resident_mask);
    for_each_id_in_mask (id, &block->resident)
        uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id));

    // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE
    // allocations should be used in order to avoid extra copies due to dirty
    // compound pages. HMM va_blocks also require PAGE_SIZE allocations.
    // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
    //       migrations.
    uvm_processor_mask_andnot(&uvm_lite_gpus, &va_space->registered_gpus, &va_space->faultable_processors);
    if (!uvm_processor_mask_empty(&uvm_lite_gpus) || uvm_va_block_is_hmm(block))
        cpu_allocation_sizes = PAGE_SIZE;

    if (block_context->mm)
        alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;

    UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
    UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);

    for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
        uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags;
        uvm_va_block_region_t region = populate_region;

        if (uvm_page_mask_test(&block->cpu.allocated, page_index)) {
            page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1;
            continue;
        }

        UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index));

        chunk_alloc_flags = alloc_flags;

        // Attempt to allocate CPU pages with the largest physically contiguous
        // size from the set of CPU chunk sizes that we can.
        // This is accomplished by:
        // 1. Aligning the CPU page address down to the allocation size.
        // 2. Ensuring that the entire allocation region fits within the VA
        //    block.
        // 3. Ensuring that the region covered by the allocation is empty.
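
        // Illustrative example of the size selection (hypothetical numbers,
        // assuming 4K base pages and allocation sizes of 4K/64K/2M; not from
        // the original comments): for a page 16K past a 64K-aligned offset, a
        // 2M allocation passes the containment check only if the block is a
        // full, aligned 2M block; a 64K allocation aligns down to the 64K
        // boundary and is used only if none of those 16 pages is already
        // allocated; otherwise the loop falls back to a single 4K chunk.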
        for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
            NvU64 alloc_virt_addr;

            chunk = NULL;
            alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(block, page_index), alloc_size);

            if (!uvm_va_block_contains_address(block, alloc_virt_addr) ||
                !uvm_va_block_contains_address(block, alloc_virt_addr + alloc_size - 1))
                continue;

            region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1);

            if (!uvm_page_mask_region_empty(&block->cpu.allocated, region))
                continue;

            // If not all pages in the allocation region are resident
            // somewhere, zero out the allocated page.
            // This could be wasteful if only a few pages in a high-order
            // allocation need to be zero'ed out but the alternative is to map
            // single sub-pages one-by-one.
            if (!uvm_page_mask_region_full(resident_mask, region))
                chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;

            status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk);
            if (status == NV_OK) {
                page_index = region.first;
                break;
            }

            UVM_ASSERT(status == NV_ERR_NO_MEMORY);
        }

        if (status != NV_OK)
            break;

        status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
        if (status != NV_OK) {
            uvm_cpu_chunk_free(chunk);
            return status;
        }

        status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index);
        if (status != NV_OK)
            break;

        // Skip iterating over all pages covered by the allocated chunk.
        page_index = region.outer - 1;
    }

    if (status != NV_OK && chunk) {
        uvm_cpu_chunk_remove_from_block(block, page_index);
        uvm_cpu_chunk_free(chunk);
    }

    return status;
}

// Try allocating a chunk. If eviction was required,
// NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
// unlocked and relocked. The caller is responsible for adding the chunk to the
// retry used_chunks list.
static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
                                       uvm_va_block_retry_t *retry,
                                       uvm_gpu_t *gpu,
                                       uvm_chunk_size_t size,
                                       uvm_gpu_chunk_t **out_gpu_chunk)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_chunk_t *gpu_chunk;

    // First try getting a free chunk from previously-made allocations.
    gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
    if (!gpu_chunk) {
        uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
        if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
            // Force eviction by pretending the allocation failed with no
            // memory
            --block_test->user_pages_allocation_retry_force_count;
            status = NV_ERR_NO_MEMORY;
        }
        else {
            // Try allocating a new one without eviction
            status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
        }

        if (status == NV_ERR_NO_MEMORY) {
            // If that fails with no memory, try allocating with eviction and
            // return back to the caller immediately so that the operation can
            // be restarted.
            uvm_mutex_unlock(&block->lock);

            status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
            if (status == NV_OK) {
                block_retry_add_free_chunk(retry, gpu_chunk);
                status = NV_ERR_MORE_PROCESSING_REQUIRED;
            }

            uvm_mutex_lock(&block->lock);
            return status;
        }
        else if (status != NV_OK) {
            return status;
        }
    }

    *out_gpu_chunk = gpu_chunk;
    return NV_OK;
}
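
// Typical caller pattern for the retry protocol above (illustrative sketch,
// not from the original sources): the caller initializes a
// uvm_va_block_retry_t, takes the block lock, and invokes the allocation. On
// NV_ERR_MORE_PROCESSING_REQUIRED the block lock was dropped and re-taken for
// eviction, so any state derived from the block must be re-validated and the
// operation re-issued; the pre-allocated chunk is then picked up from
// retry->free_chunks by block_retry_get_free_chunk() on the next attempt.
// uvm_va_block_retry_deinit() frees unused chunks and unpins used ones.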

static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);

    if (!gpu_state)
        return false;

    return gpu_state->page_table_range_4k.table ||
           gpu_state->page_table_range_big.table ||
           gpu_state->page_table_range_2m.table;
}

// A helper to get a known-to-be-present GPU VA space given a VA block that's
// locked. In order to use this function, the caller must know that at least
// one of these conditions is true:
//
// 1) The VA space lock is held
// 2) The VA block has active page tables for the GPU
//
// If the VA space lock is held (#1), then the gpu_va_space obviously can't go
// away.
//
// On the eviction path, we don't have a lock on the VA space state. However,
// since remove_gpu_va_space walks each block to unmap the GPU and free GPU
// page tables before destroying the gpu_va_space, we're guaranteed that if
// this GPU has page tables (#2), the gpu_va_space can't go away while we're
// holding the block lock.
static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
{
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);

    UVM_ASSERT(gpu);

    if (!block_gpu_has_page_tables(va_block, gpu))
        uvm_assert_rwsem_locked(&va_space->lock);

    UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id));

    gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];

    UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
    UVM_ASSERT(gpu_va_space->va_space == va_space);
    UVM_ASSERT(gpu_va_space->gpu == gpu);

    return gpu_va_space;
}

static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    uvm_gpu_va_space_t *gpu_va_space;

    // TODO: Bug 3368756: add HMM support for transparent huge page migrations.
    if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block))
        return false;

    UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M);

    gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
    return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
}

NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
{
    uvm_gpu_va_space_t *gpu_va_space;

    gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
    return gpu_va_space->page_tables.big_page_size;
}

static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
{
    NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
    NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);

    // The range must fit within a VA block
    UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));

    if (outer_addr <= first_addr)
        return uvm_va_block_region(0, 0);

    return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
}

static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
{
    uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
    return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
}

uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
{
    return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
}

uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
                                                          uvm_va_block_region_t region,
                                                          NvU32 big_page_size)
{
    NvU64 start = uvm_va_block_region_start(va_block, region);
    NvU64 end = uvm_va_block_region_end(va_block, region);
    uvm_va_block_region_t big_region;

    UVM_ASSERT(start < va_block->end);
    UVM_ASSERT(end <= va_block->end);

    big_region = range_big_page_region_all(start, end, big_page_size);
    if (big_region.outer) {
        big_region.first += region.first;
        big_region.outer += region.first;
    }

    return big_region;
}

size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
{
    return range_num_big_pages(va_block->start, va_block->end, big_page_size);
}

NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
{
    NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
    UVM_ASSERT(addr >= va_block->start);
    UVM_ASSERT(addr < va_block->end);
    return addr;
}

uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
                                                   size_t big_page_index,
                                                   NvU32 big_page_size)
{
    NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);

    // Assume that we don't have to handle multiple big PTEs per system page.
    // It's not terribly difficult to implement, but we don't currently have a
    // use case.
    UVM_ASSERT(big_page_size >= PAGE_SIZE);

    return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size);
}

// Returns the big page index (the bit index within
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
// page_index cannot be covered by a big PTE due to alignment or block size,
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
{
    uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
    size_t big_index;

    // Note that this condition also handles the case of having no big pages
    // in the block, in which case .first >= .outer.
    if (page_index < big_region_all.first || page_index >= big_region_all.outer)
        return MAX_BIG_PAGES_PER_UVM_VA_BLOCK;

    big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size);

    UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start);
    UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1);

    return big_index;
}
If 1690 // page_index cannot be covered by a big PTE due to alignment or block size, 1691 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned. 1692 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size) 1693 { 1694 uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size); 1695 size_t big_index; 1696 1697 // Note that this condition also handles the case of having no big pages in 1698 // the block, in which case .first >= .outer. 1699 if (page_index < big_region_all.first || page_index >= big_region_all.outer) 1700 return MAX_BIG_PAGES_PER_UVM_VA_BLOCK; 1701 1702 big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size); 1703 1704 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start); 1705 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1); 1706 1707 return big_index; 1708 } 1709 1710 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block, 1711 uvm_gpu_t *gpu, 1712 uvm_page_mask_t *mask_out, 1713 const unsigned long *big_ptes_in) 1714 { 1715 uvm_va_block_region_t big_region; 1716 size_t big_page_index; 1717 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 1718 1719 uvm_page_mask_zero(mask_out); 1720 1721 for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 1722 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 1723 uvm_page_mask_region_fill(mask_out, big_region); 1724 } 1725 } 1726 1727 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index) 1728 { 1729 if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) 1730 return 0; 1731 1732 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU)); 1733 1734 // Despite the fact that physical CPU memory can be allocated at sizes 1735 // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory 1736 // with 4K PTEs. Until the core kernel adds support for PMD mappings, 1737 // the return value of this function will remain at PAGE_SIZE. 1738 return PAGE_SIZE; 1739 } 1740 1741 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index) 1742 { 1743 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1744 size_t big_page_size, big_page_index; 1745 1746 if (!gpu_state) 1747 return 0; 1748 1749 if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) 1750 return 0; 1751 1752 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id)); 1753 1754 if (gpu_state->pte_is_2m) 1755 return UVM_PAGE_SIZE_2M; 1756 1757 big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id)); 1758 big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size); 1759 if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes)) 1760 return big_page_size; 1761 1762 return UVM_PAGE_SIZE_4K; 1763 } 1764 1765 // Get the size of the physical allocation backing the page, or 0 if not 1766 // resident. Note that this is different from uvm_va_block_page_size_* because 1767 // those return the size of the PTE which maps the page index, which may be 1768 // smaller than the physical allocation. 
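//
// For example, a page backed by a 2MB vidmem chunk but currently mapped with
// 64K big PTEs reports the 2MB allocation size here, while
// uvm_va_block_page_size_gpu() reports 64K for the same page.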
1769 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page) 1770 { 1771 uvm_va_block_gpu_state_t *gpu_state; 1772 uvm_chunk_size_t chunk_size; 1773 1774 if (UVM_ID_IS_CPU(page.processor)) { 1775 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index); 1776 1777 if (!uvm_page_mask_test(&block->cpu.resident, page.page_index)) 1778 return 0; 1779 1780 UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU)); 1781 return (NvU32)uvm_cpu_chunk_get_size(chunk); 1782 } 1783 1784 gpu_state = uvm_va_block_gpu_state_get(block, page.processor); 1785 if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index)) 1786 return 0; 1787 1788 UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor)); 1789 block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size); 1790 return (NvU32)chunk_size; 1791 } 1792 1793 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot) 1794 { 1795 uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX; 1796 1797 // ATOMIC and WRITE are synonyms for the CPU 1798 if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE) 1799 pte_bit_index = UVM_PTE_BITS_CPU_WRITE; 1800 else if (prot == UVM_PROT_READ_ONLY) 1801 pte_bit_index = UVM_PTE_BITS_CPU_READ; 1802 else 1803 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); 1804 1805 return pte_bit_index; 1806 } 1807 1808 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot) 1809 { 1810 uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX; 1811 1812 if (prot == UVM_PROT_READ_WRITE_ATOMIC) 1813 pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC; 1814 else if (prot == UVM_PROT_READ_WRITE) 1815 pte_bit_index = UVM_PTE_BITS_GPU_WRITE; 1816 else if (prot == UVM_PROT_READ_ONLY) 1817 pte_bit_index = UVM_PTE_BITS_GPU_READ; 1818 else 1819 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); 1820 1821 return pte_bit_index; 1822 } 1823 1824 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) 1825 { 1826 uvm_va_block_gpu_state_t *gpu_state; 1827 1828 if (UVM_ID_IS_CPU(processor)) 1829 return &block->cpu.resident; 1830 1831 gpu_state = uvm_va_block_gpu_state_get(block, processor); 1832 1833 UVM_ASSERT(gpu_state); 1834 return &gpu_state->resident; 1835 } 1836 1837 // Get the page residency mask for a processor 1838 // 1839 // Notably this will allocate GPU state if not yet present and if that fails 1840 // NULL is returned. 
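//
// Callers which require the GPU state to already exist should use
// uvm_va_block_resident_mask_get() instead, which asserts on missing state. A
// typical caller-side pattern for this variant is:
//
//     resident_mask = block_resident_mask_get_alloc(block, dest_id);
//     if (!resident_mask)
//         return NV_ERR_NO_MEMORY;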
1841 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor) 1842 { 1843 uvm_va_block_gpu_state_t *gpu_state; 1844 1845 if (UVM_ID_IS_CPU(processor)) 1846 return &block->cpu.resident; 1847 1848 gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor)); 1849 if (!gpu_state) 1850 return NULL; 1851 1852 return &gpu_state->resident; 1853 } 1854 1855 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block, 1856 uvm_processor_id_t processor, 1857 uvm_prot_t prot) 1858 { 1859 uvm_va_block_gpu_state_t *gpu_state; 1860 1861 if (UVM_ID_IS_CPU(processor)) 1862 return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)]; 1863 1864 gpu_state = uvm_va_block_gpu_state_get(block, processor); 1865 1866 UVM_ASSERT(gpu_state); 1867 return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)]; 1868 } 1869 1870 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) 1871 { 1872 return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY); 1873 } 1874 1875 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) 1876 { 1877 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); 1878 UVM_ASSERT(gpu_state); 1879 1880 return &gpu_state->evicted; 1881 } 1882 1883 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index) 1884 { 1885 uvm_processor_id_t id; 1886 for_each_id_in_mask(id, &block->resident) { 1887 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index)) 1888 return true; 1889 } 1890 1891 return false; 1892 } 1893 1894 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) 1895 { 1896 uvm_va_block_gpu_state_t *gpu_state; 1897 size_t chunk_index; 1898 1899 if (UVM_ID_IS_CPU(proc)) 1900 return uvm_page_mask_test(&block->cpu.allocated, page_index); 1901 1902 gpu_state = uvm_va_block_gpu_state_get(block, proc); 1903 if (!gpu_state) 1904 return false; 1905 1906 chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL); 1907 return gpu_state->chunks[chunk_index] != NULL; 1908 } 1909 1910 static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) 1911 { 1912 const uvm_page_mask_t *resident_mask; 1913 1914 if (UVM_ID_IS_CPU(proc)) { 1915 resident_mask = &block->cpu.resident; 1916 } 1917 else { 1918 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc); 1919 if (!gpu_state) 1920 return false; 1921 1922 resident_mask = &gpu_state->resident; 1923 } 1924 1925 return uvm_page_mask_test(resident_mask, page_index); 1926 } 1927 1928 // Compute the gpus that have at least the given access permissions for the 1929 // range described by region and page_mask. The function sets the bit if any 1930 // page in the region has the permissions. 
1931 static void block_region_authorized_gpus(uvm_va_block_t *va_block, 1932 uvm_va_block_region_t region, 1933 uvm_prot_t access_permission, 1934 uvm_processor_mask_t *authorized_gpus) 1935 { 1936 uvm_gpu_id_t gpu_id; 1937 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission); 1938 1939 uvm_processor_mask_zero(authorized_gpus); 1940 1941 // Test all GPUs with mappings on the block 1942 for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) { 1943 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1944 if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region)) 1945 uvm_processor_mask_set(authorized_gpus, gpu_id); 1946 } 1947 } 1948 1949 // Compute the processors that have at least the given access permissions for 1950 // the range described by region and page_mask. The function sets the bit if any 1951 // page in the region has the permissions. 1952 static void block_region_authorized_processors(uvm_va_block_t *va_block, 1953 uvm_va_block_region_t region, 1954 uvm_prot_t access_permission, 1955 uvm_processor_mask_t *authorized_processors) 1956 { 1957 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission); 1958 1959 // Compute GPUs 1960 block_region_authorized_gpus(va_block, region, access_permission, authorized_processors); 1961 1962 // Test CPU 1963 if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) && 1964 !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) { 1965 uvm_processor_mask_set(authorized_processors, UVM_ID_CPU); 1966 } 1967 } 1968 1969 static void block_page_authorized_processors(uvm_va_block_t *va_block, 1970 uvm_page_index_t page_index, 1971 uvm_prot_t access_permission, 1972 uvm_processor_mask_t *authorized_processors) 1973 { 1974 block_region_authorized_processors(va_block, 1975 uvm_va_block_region_for_page(page_index), 1976 access_permission, 1977 authorized_processors); 1978 } 1979 1980 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block, 1981 uvm_va_block_region_t region, 1982 uvm_gpu_id_t gpu_id, 1983 uvm_prot_t required_prot) 1984 { 1985 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot); 1986 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 1987 1988 if (!gpu_state) 1989 return false; 1990 1991 return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region); 1992 } 1993 1994 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block, 1995 uvm_va_block_region_t region, 1996 uvm_processor_id_t processor_id, 1997 uvm_prot_t required_prot) 1998 { 1999 if (UVM_ID_IS_CPU(processor_id)) { 2000 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot); 2001 2002 return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region); 2003 } 2004 else { 2005 return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot); 2006 } 2007 } 2008 2009 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block, 2010 uvm_page_index_t page_index, 2011 uvm_gpu_id_t gpu_id, 2012 uvm_prot_t required_prot) 2013 { 2014 return block_is_gpu_authorized_on_whole_region(va_block, 2015 uvm_va_block_region_for_page(page_index), 2016 gpu_id, 2017 required_prot); 2018 } 2019 2020 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block, 2021 uvm_page_index_t page_index, 2022 uvm_processor_id_t processor_id, 2023 uvm_prot_t required_prot) 2024 { 2025 return 
block_is_processor_authorized_on_whole_region(va_block, 2026 uvm_va_block_region_for_page(page_index), 2027 processor_id, 2028 required_prot); 2029 } 2030 2031 // Compute the gpus that have a copy of the given page resident in their memory 2032 static void block_page_resident_gpus(uvm_va_block_t *va_block, 2033 uvm_page_index_t page_index, 2034 uvm_processor_mask_t *resident_gpus) 2035 { 2036 uvm_gpu_id_t id; 2037 uvm_processor_mask_zero(resident_gpus); 2038 2039 for_each_gpu_id_in_mask(id, &va_block->resident) { 2040 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) { 2041 UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index)); 2042 uvm_processor_mask_set(resident_gpus, id); 2043 } 2044 } 2045 } 2046 2047 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block, 2048 uvm_page_index_t page_index, 2049 uvm_processor_mask_t *resident_processors) 2050 { 2051 block_page_resident_gpus(va_block, page_index, resident_processors); 2052 2053 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) { 2054 UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index)); 2055 uvm_processor_mask_set(resident_processors, UVM_ID_CPU); 2056 } 2057 } 2058 2059 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index) 2060 { 2061 uvm_processor_mask_t resident_processors; 2062 uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); 2063 2064 return uvm_processor_mask_get_count(&resident_processors); 2065 } 2066 2067 static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block, 2068 uvm_page_index_t page_index, 2069 uvm_processor_id_t processor, 2070 const uvm_processor_mask_t *processor_mask) 2071 { 2072 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 2073 uvm_processor_mask_t search_mask; 2074 uvm_processor_id_t id; 2075 2076 if (processor_mask) 2077 uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident); 2078 else 2079 uvm_processor_mask_copy(&search_mask, &va_block->resident); 2080 2081 for_each_closest_id(id, &search_mask, processor, va_space) { 2082 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) 2083 return id; 2084 } 2085 2086 return UVM_ID_INVALID; 2087 } 2088 2089 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block, 2090 uvm_page_index_t page_index, 2091 uvm_processor_id_t processor) 2092 { 2093 return block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL); 2094 } 2095 2096 // We don't track the specific aperture of each mapped page. Instead, we assume 2097 // that each virtual mapping from a given processor always targets the closest 2098 // processor on which that page is resident (with special rules for UVM-Lite). 2099 // 2100 // This function verifies that assumption: before a page becomes resident on a 2101 // new location, assert that no processor has a valid mapping to a farther 2102 // processor on that page. 
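//
// Like the other block_check_* helpers this always returns true; it is meant
// to be invoked under UVM_ASSERT() so the whole traversal disappears in
// builds with asserts disabled, e.g.:
//
//     UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));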
2103 static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency) 2104 { 2105 uvm_processor_mask_t resident_procs, mapped_procs; 2106 uvm_processor_id_t mapped_id, closest_id; 2107 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 2108 2109 uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block)); 2110 2111 for_each_id_in_mask(mapped_id, &mapped_procs) { 2112 if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) 2113 continue; 2114 2115 uvm_va_block_page_resident_processors(block, page_index, &resident_procs); 2116 UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs)); 2117 UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency)); 2118 uvm_processor_mask_set(&resident_procs, new_residency); 2119 closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id); 2120 UVM_ASSERT(!uvm_id_equal(closest_id, new_residency)); 2121 } 2122 2123 return true; 2124 } 2125 2126 // Returns the processor to which page_index should be mapped on gpu 2127 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block, 2128 uvm_gpu_t *gpu, 2129 uvm_page_index_t page_index) 2130 { 2131 uvm_processor_id_t dest_id; 2132 2133 // UVM-Lite GPUs can only map pages on the preferred location 2134 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) 2135 return uvm_va_range_get_policy(block->va_range)->preferred_location; 2136 2137 // Otherwise we always map the closest resident processor 2138 dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id); 2139 UVM_ASSERT(UVM_ID_IS_VALID(dest_id)); 2140 return dest_id; 2141 } 2142 2143 // Returns the processor to which page_index should be mapped on mapping_id 2144 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block, 2145 uvm_processor_id_t mapping_id, 2146 uvm_page_index_t page_index) 2147 { 2148 2149 if (UVM_ID_IS_CPU(mapping_id)) 2150 return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id); 2151 2152 return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index); 2153 } 2154 2155 static void block_get_mapped_processors(uvm_va_block_t *block, 2156 uvm_processor_id_t resident_id, 2157 uvm_page_index_t page_index, 2158 uvm_processor_mask_t *mapped_procs) 2159 { 2160 uvm_processor_id_t mapped_id; 2161 2162 uvm_processor_mask_zero(mapped_procs); 2163 2164 for_each_id_in_mask(mapped_id, &block->mapped) { 2165 if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) { 2166 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); 2167 2168 if (uvm_id_equal(to_map_id, resident_id)) 2169 uvm_processor_mask_set(mapped_procs, mapped_id); 2170 } 2171 } 2172 } 2173 2174 // We use block_gpu_get_processor_to_map to find the destination processor of a 2175 // given GPU mapping. This function is called when the mapping is established to 2176 // sanity check that the destination of the mapping matches the query. 
2177 static bool block_check_mapping_residency_region(uvm_va_block_t *block, 2178 uvm_gpu_t *gpu, 2179 uvm_processor_id_t mapping_dest, 2180 uvm_va_block_region_t region, 2181 const uvm_page_mask_t *page_mask) 2182 { 2183 uvm_page_index_t page_index; 2184 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2185 NvU64 va = uvm_va_block_cpu_page_address(block, page_index); 2186 uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index); 2187 UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map), 2188 "VA 0x%llx on %s: mapping %s, supposed to map %s", 2189 va, 2190 uvm_gpu_name(gpu), 2191 block_processor_name(block, mapping_dest), 2192 block_processor_name(block, proc_to_map)); 2193 } 2194 return true; 2195 } 2196 2197 static bool block_check_mapping_residency(uvm_va_block_t *block, 2198 uvm_gpu_t *gpu, 2199 uvm_processor_id_t mapping_dest, 2200 const uvm_page_mask_t *page_mask) 2201 { 2202 return block_check_mapping_residency_region(block, 2203 gpu, 2204 mapping_dest, 2205 uvm_va_block_region_from_block(block), 2206 page_mask); 2207 } 2208 2209 // Check that there are no mappings targeting resident_id from any processor in 2210 // the block. 2211 static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id) 2212 { 2213 uvm_processor_id_t mapped_id; 2214 uvm_page_index_t page_index; 2215 2216 for_each_id_in_mask(mapped_id, &block->mapped) { 2217 const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id); 2218 2219 for_each_va_block_page_in_mask(page_index, map_mask, block) { 2220 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); 2221 UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id)); 2222 } 2223 } 2224 2225 return true; 2226 } 2227 2228 // Zero all pages of the newly-populated chunk which are not resident anywhere 2229 // else in the system, adding that work to the block's tracker. In all cases, 2230 // this function adds a dependency on passed in tracker to the block's tracker. 2231 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block, 2232 uvm_gpu_t *gpu, 2233 uvm_gpu_chunk_t *chunk, 2234 uvm_va_block_region_t chunk_region, 2235 uvm_tracker_t *tracker) 2236 { 2237 uvm_va_block_gpu_state_t *gpu_state; 2238 NV_STATUS status; 2239 uvm_gpu_address_t memset_addr_base, memset_addr; 2240 uvm_push_t push; 2241 uvm_gpu_id_t id; 2242 uvm_va_block_region_t subregion; 2243 uvm_page_mask_t *zero_mask; 2244 2245 UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk)); 2246 2247 if (chunk->is_zero) 2248 return NV_OK; 2249 2250 gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 2251 zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS); 2252 2253 if (!zero_mask) 2254 return NV_ERR_NO_MEMORY; 2255 2256 // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the 2257 // operation. 2258 // 2259 // We may over-zero the page with this approach. For example, we might be 2260 // populating a 2MB chunk because only a single page within that chunk needs 2261 // to be made resident. If we also zero non-resident pages outside of the 2262 // strict region, we could waste the effort if those pages are populated on 2263 // another processor later and migrated here. 2264 // 2265 // We zero all non-resident pages in the chunk anyway for two reasons: 2266 // 2267 // 1) Efficiency. It's better to do all zeros as pipelined transfers once 2268 // rather than scatter them around for each populate operation. 
2269 // 2270 // 2) Optimizing the common case of block_populate_gpu_chunk being called 2271 // for already-populated chunks. If we zero once at initial populate, we 2272 // can simply check whether the chunk is present in the array. Otherwise 2273 // we'd have to recompute the "is any page resident" mask every time. 2274 2275 // Roll up all pages in chunk_region which are resident somewhere 2276 uvm_page_mask_zero(zero_mask); 2277 for_each_id_in_mask(id, &block->resident) 2278 uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id)); 2279 2280 // If all pages in the chunk are resident somewhere, we don't need to clear 2281 // anything. Just make sure the chunk is tracked properly. 2282 if (uvm_page_mask_region_full(zero_mask, chunk_region)) { 2283 status = uvm_tracker_add_tracker_safe(&block->tracker, tracker); 2284 goto out; 2285 } 2286 2287 // Complement to get the pages which are not resident anywhere. These 2288 // are the pages which must be zeroed. 2289 uvm_page_mask_complement(zero_mask, zero_mask); 2290 2291 memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address)); 2292 memset_addr = memset_addr_base; 2293 2294 status = uvm_push_begin_acquire(gpu->channel_manager, 2295 UVM_CHANNEL_TYPE_GPU_INTERNAL, 2296 tracker, 2297 &push, 2298 "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)", 2299 chunk->address, 2300 chunk->address + uvm_gpu_chunk_get_size(chunk), 2301 uvm_va_block_region_start(block, chunk_region), 2302 uvm_va_block_region_end(block, chunk_region) + 1, 2303 block->start, 2304 block->end + 1); 2305 if (status != NV_OK) 2306 goto out; 2307 2308 for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) { 2309 // Pipeline the memsets since they never overlap with each other 2310 uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); 2311 2312 // We'll push one membar later for all memsets in this loop 2313 uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); 2314 2315 memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE; 2316 gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion)); 2317 } 2318 2319 // A membar from this GPU is required between this memset and any PTE write 2320 // pointing this or another GPU to this chunk. Otherwise an engine could 2321 // read the PTE then access the page before the memset write is visible to 2322 // that engine. 2323 // 2324 // This memset writes GPU memory, so local mappings need only a GPU-local 2325 // membar. We can't easily determine here whether a peer GPU will ever map 2326 // this page in the future, so always use a sysmembar. uvm_push_end provides 2327 // one by default. 2328 // 2329 // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map 2330 // this page. When peer access gets enabled, do a MEMBAR_SYS at that 2331 // point. 
2332 uvm_push_end(&push); 2333 status = uvm_tracker_add_push_safe(&block->tracker, &push); 2334 2335 out: 2336 if (zero_mask) 2337 kmem_cache_free(g_uvm_page_mask_cache, zero_mask); 2338 2339 return status; 2340 } 2341 2342 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block, 2343 uvm_va_block_retry_t *retry, 2344 uvm_gpu_t *gpu, 2345 size_t chunk_index, 2346 uvm_va_block_region_t chunk_region) 2347 { 2348 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); 2349 uvm_gpu_chunk_t *chunk = NULL; 2350 uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region); 2351 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); 2352 NV_STATUS status; 2353 2354 if (!gpu_state) 2355 return NV_ERR_NO_MEMORY; 2356 2357 uvm_assert_mutex_locked(&block->lock); 2358 UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu)); 2359 UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes); 2360 2361 // We zero chunks as necessary at initial population, so if the chunk is 2362 // already populated we're done. See the comment in 2363 // block_zero_new_gpu_chunk. 2364 if (gpu_state->chunks[chunk_index]) 2365 return NV_OK; 2366 2367 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region)); 2368 2369 status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk); 2370 if (status != NV_OK) 2371 return status; 2372 2373 // In some configurations such as SR-IOV heavy, the chunk cannot be 2374 // referenced using its physical address. Create a virtual mapping. 2375 status = uvm_mmu_chunk_map(chunk); 2376 if (status != NV_OK) 2377 goto chunk_free; 2378 2379 status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker); 2380 if (status != NV_OK) 2381 goto chunk_unmap; 2382 2383 // It is safe to modify the page index field without holding any PMM locks 2384 // because the chunk is pinned, which means that none of the other fields in 2385 // the bitmap can change. 2386 chunk->va_block_page_index = chunk_region.first; 2387 2388 // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at 2389 // compile-time that it can store VA Block page indexes. 2390 BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE); 2391 2392 status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk); 2393 if (status != NV_OK) 2394 goto chunk_unmap; 2395 2396 if (block_test && block_test->inject_populate_error) { 2397 block_test->inject_populate_error = false; 2398 2399 // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than 2400 // causing a fatal OOM failure. 2401 status = NV_ERR_MORE_PROCESSING_REQUIRED; 2402 goto chunk_unmap_indirect_peers; 2403 } 2404 2405 // Record the used chunk so that it can be unpinned at the end of the whole 2406 // operation. 2407 block_retry_add_used_chunk(retry, chunk); 2408 gpu_state->chunks[chunk_index] = chunk; 2409 2410 return NV_OK; 2411 2412 chunk_unmap_indirect_peers: 2413 block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk); 2414 2415 chunk_unmap: 2416 uvm_mmu_chunk_unmap(chunk, &block->tracker); 2417 2418 chunk_free: 2419 // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it 2420 // placed in the block tracker. 2421 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); 2422 2423 return status; 2424 } 2425 2426 // Populate all chunks which cover the given region and page mask. 
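//
// Chunks are walked in address order and a chunk is only populated if at
// least one page of populate_mask falls inside it, so chunks whose pages are
// entirely outside the mask are skipped without being allocated. For example,
// if this block happens to be backed by 64K GPU chunks and the mask contains
// a single page, only the one chunk covering that page is populated.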
2427 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block, 2428 uvm_va_block_retry_t *retry, 2429 uvm_gpu_t *gpu, 2430 uvm_va_block_region_t region, 2431 const uvm_page_mask_t *populate_mask) 2432 { 2433 uvm_va_block_region_t chunk_region, check_region; 2434 size_t chunk_index; 2435 uvm_page_index_t page_index; 2436 uvm_chunk_size_t chunk_size; 2437 NV_STATUS status; 2438 2439 page_index = uvm_va_block_first_page_in_mask(region, populate_mask); 2440 if (page_index == region.outer) 2441 return NV_OK; 2442 2443 chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); 2444 chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index); 2445 2446 while (1) { 2447 check_region = uvm_va_block_region(max(chunk_region.first, region.first), 2448 min(chunk_region.outer, region.outer)); 2449 page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask); 2450 if (page_index != check_region.outer) { 2451 status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region); 2452 if (status != NV_OK) 2453 return status; 2454 } 2455 2456 if (check_region.outer == region.outer) 2457 break; 2458 2459 ++chunk_index; 2460 chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer); 2461 chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE)); 2462 } 2463 2464 return NV_OK; 2465 } 2466 2467 static NV_STATUS block_populate_pages(uvm_va_block_t *block, 2468 uvm_va_block_retry_t *retry, 2469 uvm_va_block_context_t *block_context, 2470 uvm_processor_id_t dest_id, 2471 uvm_va_block_region_t region, 2472 const uvm_page_mask_t *page_mask) 2473 { 2474 NV_STATUS status; 2475 const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id); 2476 uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask; 2477 uvm_memcg_context_t memcg_context; 2478 2479 if (!resident_mask) 2480 return NV_ERR_NO_MEMORY; 2481 2482 if (page_mask) 2483 uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask); 2484 else 2485 uvm_page_mask_complement(populate_page_mask, resident_mask); 2486 2487 if (UVM_ID_IS_GPU(dest_id)) 2488 return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask); 2489 2490 uvm_memcg_context_start(&memcg_context, block_context->mm); 2491 status = block_populate_pages_cpu(block, populate_page_mask, region, block_context); 2492 uvm_memcg_context_end(&memcg_context); 2493 return status; 2494 } 2495 2496 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from) 2497 { 2498 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 2499 2500 return &va_space->can_copy_from[uvm_id_value(from)]; 2501 } 2502 2503 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to) 2504 { 2505 return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from); 2506 } 2507 2508 // Get the chunk containing the given page, along with the offset of that page 2509 // within the chunk. 
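//
// The returned offset is in bytes from the start of the chunk, i.e. roughly
//
//     chunk_offset = (page_index - first_page_of_chunk_region) * PAGE_SIZE
//
// so for a chunk whose region starts at page 32, page 35 maps to an offset of
// 3 * PAGE_SIZE within the chunk.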
static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
{
    uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
    size_t chunk_index;
    uvm_gpu_chunk_t *chunk;
    uvm_chunk_size_t chunk_size;

    UVM_ASSERT(gpu_state);

    chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
    chunk = gpu_state->chunks[chunk_index];
    UVM_ASSERT(chunk);

    if (chunk_offset) {
        size_t page_offset = block_page.page_index -
                             uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
        *chunk_offset = page_offset * PAGE_SIZE;
    }

    return chunk;
}

// Get the physical GPU address of a block's page from the POV of the specified
// GPU. This is the address that should be used for making PTEs for the
// specified GPU.
static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
                                                      block_phys_page_t block_page,
                                                      uvm_gpu_t *gpu)
{
    uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
    size_t chunk_offset;
    uvm_gpu_chunk_t *chunk;

    UVM_ASSERT(accessing_gpu_state);

    if (UVM_ID_IS_CPU(block_page.processor)) {
        uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
        NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
        uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
                                                                       uvm_cpu_chunk_get_size(chunk),
                                                                       block_page.page_index);

        // The page should be mapped for physical access already as we do that
        // eagerly on CPU page population and GPU state alloc.
        UVM_ASSERT(dma_addr != 0);
        dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;

        return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
    }

    chunk = block_phys_page_chunk(block, block_page, &chunk_offset);

    if (uvm_id_equal(block_page.processor, gpu->id)) {
        return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
    }
    else {
        uvm_gpu_phys_address_t phys_addr;
        uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
        uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);

        UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
        phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
        phys_addr.address += chunk_offset;
        return phys_addr;
    }
}

// Get the physical GPU address of a block's page from the POV of the specified
// GPU, suitable for accessing the memory from UVM-internal CE channels.
//
// Notably this may be different from block_phys_page_address() to handle CE
// limitations in addressing physical memory directly.
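//
// In practice, CPU and local vidmem pages resolve to the same address as
// block_phys_page_address() (converted for copy engine use), while pages on a
// peer GPU go through uvm_pmm_gpu_peer_copy_address(), which may return an
// identity-mapped virtual address when direct physical peer addressing is not
// usable.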
2582 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block, 2583 block_phys_page_t block_page, 2584 uvm_gpu_t *gpu) 2585 { 2586 uvm_gpu_t *owning_gpu; 2587 size_t chunk_offset; 2588 uvm_gpu_chunk_t *chunk; 2589 uvm_gpu_address_t copy_addr; 2590 uvm_va_space_t *va_space; 2591 2592 UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor), 2593 "from %s to %s\n", 2594 block_processor_name(block, gpu->id), 2595 block_processor_name(block, block_page.processor)); 2596 2597 // CPU and local GPU accesses can rely on block_phys_page_address, but the 2598 // resulting physical address may need to be converted into virtual. 2599 if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id)) 2600 return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu)); 2601 2602 va_space = uvm_va_block_get_va_space(block); 2603 2604 // See the comments on the peer_identity_mappings_supported assignments in 2605 // the HAL for why we disable direct copies between peers. 2606 owning_gpu = block_get_gpu(block, block_page.processor); 2607 2608 UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu)); 2609 2610 chunk = block_phys_page_chunk(block, block_page, &chunk_offset); 2611 copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu); 2612 copy_addr.address += chunk_offset; 2613 return copy_addr; 2614 } 2615 2616 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block, 2617 uvm_page_index_t page_index, 2618 uvm_processor_id_t residency, 2619 uvm_gpu_t *gpu) 2620 { 2621 uvm_assert_mutex_locked(&va_block->lock); 2622 2623 return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu); 2624 } 2625 2626 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, 2627 uvm_page_index_t page_index, 2628 uvm_gpu_t *gpu) 2629 { 2630 return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu); 2631 } 2632 2633 typedef struct 2634 { 2635 // Location of the memory 2636 uvm_processor_id_t id; 2637 2638 // Whether the whole block has a single physically-contiguous chunk of 2639 // storage on the processor. 2640 bool is_block_contig; 2641 2642 // Starting address of the physically-contiguous allocation, from the view 2643 // of the copying GPU. Valid only if is_block_contig. 2644 uvm_gpu_address_t gpu_address; 2645 } block_copy_addr_t; 2646 2647 typedef struct 2648 { 2649 block_copy_addr_t src; 2650 block_copy_addr_t dst; 2651 uvm_conf_computing_dma_buffer_t *dma_buffer; 2652 } block_copy_state_t; 2653 2654 // Begin a push appropriate for copying data from src_id processor to dst_id processor. 2655 // One of src_id and dst_id needs to be a GPU. 
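//
// The pushing GPU and channel type follow the copy direction:
//
//     CPU -> GPU: the destination GPU pushes on a CPU_TO_GPU channel
//     GPU -> CPU: the source GPU pushes on a GPU_TO_CPU channel
//     GPU -> GPU: the source GPU pushes on a GPU_TO_GPU channel, which tends
//                 to behave better for P2P over PCIe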
2656 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block, 2657 block_copy_state_t *copy_state, 2658 uvm_tracker_t *tracker, 2659 uvm_push_t *push) 2660 { 2661 uvm_gpu_t *gpu; 2662 NV_STATUS status; 2663 uvm_channel_type_t channel_type; 2664 uvm_tracker_t *tracker_ptr = tracker; 2665 uvm_processor_id_t dst_id = copy_state->dst.id; 2666 uvm_processor_id_t src_id = copy_state->src.id; 2667 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 2668 2669 UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id), 2670 "Unexpected copy to self, processor %s\n", 2671 block_processor_name(va_block, src_id)); 2672 2673 if (UVM_ID_IS_CPU(src_id)) { 2674 gpu = block_get_gpu(va_block, dst_id); 2675 channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU; 2676 } 2677 else if (UVM_ID_IS_CPU(dst_id)) { 2678 gpu = block_get_gpu(va_block, src_id); 2679 channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU; 2680 } 2681 else { 2682 // For GPU to GPU copies, prefer to "push" the data from the source as 2683 // that works better at least for P2P over PCI-E. 2684 gpu = block_get_gpu(va_block, src_id); 2685 2686 channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU; 2687 } 2688 2689 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id), 2690 "GPU %s dst %s src %s\n", 2691 block_processor_name(va_block, gpu->id), 2692 block_processor_name(va_block, dst_id), 2693 block_processor_name(va_block, src_id)); 2694 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id), 2695 "GPU %s dst %s src %s\n", 2696 block_processor_name(va_block, gpu->id), 2697 block_processor_name(va_block, dst_id), 2698 block_processor_name(va_block, src_id)); 2699 2700 if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) { 2701 uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id); 2702 return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager, 2703 dst_gpu, 2704 tracker, 2705 push, 2706 "Copy from %s to %s for block [0x%llx, 0x%llx]", 2707 block_processor_name(va_block, src_id), 2708 block_processor_name(va_block, dst_id), 2709 va_block->start, 2710 va_block->end); 2711 } 2712 2713 if (uvm_conf_computing_mode_enabled(gpu)) { 2714 // When the Confidential Feature is enabled, additional dependencies 2715 // apply to the input tracker as well as the dma_buffer tracker. 2716 // * In the CPU to GPU case, because UVM performs CPU side 2717 // crypto-operations first before the GPU copy, we both need to 2718 // ensure that the dma_buffer and the input tracker are completed. 2719 // * In the GPU to CPU case, the GPU copy happens first, but the same 2720 // principles apply. Hence, UVM acquires the input tracker and the 2721 // dma buffer. 
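        // The dependencies are gathered in local_tracker below: the input
        // tracker is copied in first and the DMA buffer allocation then adds
        // its own pending work. For CPU to GPU copies the tracker is also
        // waited on immediately, since the CPU-side encryption must not start
        // before that work has completed.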
2722 status = uvm_tracker_overwrite_safe(&local_tracker, tracker); 2723 if (status != NV_OK) 2724 goto error; 2725 2726 UVM_ASSERT(copy_state->dma_buffer == NULL); 2727 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, 2728 ©_state->dma_buffer, 2729 &local_tracker); 2730 2731 if (status != NV_OK) 2732 goto error; 2733 2734 if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) { 2735 status = uvm_tracker_wait(&local_tracker); 2736 if (status != NV_OK) 2737 goto error; 2738 } 2739 2740 tracker_ptr = &local_tracker; 2741 } 2742 2743 status = uvm_push_begin_acquire(gpu->channel_manager, 2744 channel_type, 2745 tracker_ptr, 2746 push, 2747 "Copy from %s to %s for block [0x%llx, 0x%llx]", 2748 block_processor_name(va_block, src_id), 2749 block_processor_name(va_block, dst_id), 2750 va_block->start, 2751 va_block->end); 2752 2753 error: 2754 // Caller is responsible for freeing the DMA buffer on error 2755 uvm_tracker_deinit(&local_tracker); 2756 return status; 2757 } 2758 2759 // A page is clean iff... 2760 // the destination is the preferred location and 2761 // the source is the CPU and 2762 // the destination does not support faults/eviction and 2763 // the CPU page is not dirty 2764 static bool block_page_is_clean(uvm_va_block_t *block, 2765 uvm_processor_id_t dst_id, 2766 uvm_processor_id_t src_id, 2767 uvm_page_index_t page_index) 2768 { 2769 return !uvm_va_block_is_hmm(block) && 2770 uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) && 2771 UVM_ID_IS_CPU(src_id) && 2772 !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling && 2773 !block_cpu_page_is_dirty(block, page_index); 2774 } 2775 2776 // When the destination is the CPU... 2777 // if the source is the preferred location, mark as clean 2778 // otherwise, mark as dirty 2779 static void block_update_page_dirty_state(uvm_va_block_t *block, 2780 uvm_processor_id_t dst_id, 2781 uvm_processor_id_t src_id, 2782 uvm_page_index_t page_index) 2783 { 2784 if (UVM_ID_IS_GPU(dst_id)) 2785 return; 2786 2787 if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location)) 2788 block_mark_cpu_page_clean(block, page_index); 2789 else 2790 block_mark_cpu_page_dirty(block, page_index); 2791 } 2792 2793 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id) 2794 { 2795 uvm_gpu_t *gpu; 2796 2797 if (UVM_ID_IS_CPU(id)) 2798 return; 2799 2800 gpu = block_get_gpu(block, id); 2801 2802 // If the block is of the max size and the GPU supports eviction, mark the 2803 // root chunk as used in PMM. 2804 // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks. 
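    // For a max-size block, chunks[0] is the chunk backing the start of the
    // block; PMM uses it to find (and mark) the containing root chunk.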
2805 if (!uvm_va_block_is_hmm(block) && 2806 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && 2807 uvm_gpu_supports_eviction(gpu)) { 2808 // The chunk has to be there if this GPU is resident 2809 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id)); 2810 uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]); 2811 } 2812 } 2813 2814 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) 2815 { 2816 UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); 2817 2818 if (uvm_processor_mask_test_and_set(&block->resident, id)) 2819 return; 2820 2821 block_mark_memory_used(block, id); 2822 } 2823 2824 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) 2825 { 2826 uvm_gpu_t *gpu; 2827 2828 UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); 2829 2830 if (!uvm_processor_mask_test_and_clear(&block->resident, id)) 2831 return; 2832 2833 if (UVM_ID_IS_CPU(id)) 2834 return; 2835 2836 gpu = block_get_gpu(block, id); 2837 2838 // If the block is of the max size and the GPU supports eviction, mark the 2839 // root chunk as unused in PMM. 2840 if (!uvm_va_block_is_hmm(block) && 2841 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && 2842 uvm_gpu_supports_eviction(gpu)) { 2843 // The chunk may not be there any more when residency is cleared. 2844 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 2845 if (gpu_state && gpu_state->chunks[0]) 2846 uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]); 2847 } 2848 } 2849 2850 static bool block_phys_copy_contig_check(uvm_va_block_t *block, 2851 uvm_page_index_t page_index, 2852 const uvm_gpu_address_t *base_address, 2853 uvm_processor_id_t proc_id, 2854 uvm_gpu_t *copying_gpu) 2855 { 2856 uvm_gpu_address_t page_address; 2857 uvm_gpu_address_t contig_address = *base_address; 2858 2859 contig_address.address += page_index * PAGE_SIZE; 2860 2861 page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu); 2862 2863 return uvm_gpu_addr_cmp(page_address, contig_address) == 0; 2864 } 2865 2866 // Check if the VA block has a single physically-contiguous chunk of storage 2867 // on the processor. 2868 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id) 2869 { 2870 uvm_cpu_chunk_t *chunk; 2871 2872 if (UVM_ID_IS_GPU(id)) 2873 return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0); 2874 2875 chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL); 2876 return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk)); 2877 } 2878 2879 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block, 2880 uvm_page_index_t page_index, 2881 uvm_processor_id_t resident_id) 2882 { 2883 if (UVM_ID_IS_CPU(resident_id)) { 2884 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 2885 return uvm_cpu_chunk_block_region(block, chunk, page_index); 2886 } 2887 else { 2888 uvm_chunk_size_t chunk_size; 2889 (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size); 2890 return uvm_va_block_chunk_region(block, chunk_size, page_index); 2891 } 2892 } 2893 2894 // Like block_phys_page_copy_address, but uses the address cached in bca when 2895 // possible. 
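//
// The cached base address is only valid when the whole block is backed by one
// physically-contiguous allocation on bca->id, in which case the per-page
// address is simply
//
//     addr = bca->gpu_address;
//     addr.address += page_index * PAGE_SIZE;
//
// Otherwise the address is recomputed for each page.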
2896 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block, 2897 block_copy_addr_t *bca, 2898 uvm_page_index_t page_index, 2899 uvm_gpu_t *copying_gpu) 2900 { 2901 if (bca->is_block_contig) { 2902 uvm_gpu_address_t addr = bca->gpu_address; 2903 addr.address += page_index * PAGE_SIZE; 2904 UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu)); 2905 return addr; 2906 } 2907 2908 return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu); 2909 } 2910 2911 // When the Confidential Computing feature is enabled, the function performs 2912 // CPU side page encryption and GPU side decryption to the CPR. 2913 // GPU operations respect the caller's membar previously set in the push. 2914 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block, 2915 block_copy_state_t *copy_state, 2916 uvm_va_block_region_t region, 2917 uvm_push_t *push) 2918 { 2919 uvm_push_flag_t membar_flag = 0; 2920 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 2921 uvm_page_index_t page_index = region.first; 2922 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer; 2923 struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index); 2924 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 2925 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 2926 char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) + 2927 (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE); 2928 uvm_gpu_address_t dst_address = block_copy_get_address(block, ©_state->dst, page_index, gpu); 2929 char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE); 2930 2931 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id)); 2932 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id)); 2933 2934 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 2935 2936 // See comment in block_copy_begin_push. 2937 UVM_ASSERT(uvm_tracker_is_completed(&block->tracker)); 2938 2939 staging_buffer.address += page_index * PAGE_SIZE; 2940 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2941 2942 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) 2943 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE; 2944 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) 2945 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU; 2946 2947 // kmap() only guarantees PAGE_SIZE contiguity, all encryption and 2948 // decryption must happen on a PAGE_SIZE basis. 2949 for_each_va_block_page_in_region(page_index, region) { 2950 void *src_cpu_virt_addr; 2951 2952 // The caller guarantees that all pages in region are contiguous, 2953 // meaning they're guaranteed to be part of the same compound page. 2954 UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index)); 2955 2956 src_cpu_virt_addr = kmap(src_page); 2957 uvm_conf_computing_cpu_encrypt(push->channel, 2958 cpu_va_staging_buffer, 2959 src_cpu_virt_addr, 2960 NULL, 2961 PAGE_SIZE, 2962 cpu_auth_tag_buffer); 2963 kunmap(src_page); 2964 2965 // First LCE operation should be non-pipelined to guarantee ordering as 2966 // we do not know when was the last non-pipelined copy. 2967 // Last one applies the membar originally planned for the push if any 2968 // TODO: 3857691: Inherit policy instead of forcing first invocation to 2969 // be non pipelined. 
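        // The net effect of the flags below: the first decrypt in the region
        // is not pipelined, every decrypt except the last uses no membar, and
        // the last decrypt carries whichever membar (if any) the caller had
        // originally requested for the push.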
2970 if (page_index > region.first) 2971 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); 2972 2973 if (page_index < (region.outer - 1)) 2974 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); 2975 else if (membar_flag) 2976 uvm_push_set_flag(push, membar_flag); 2977 2978 gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer); 2979 2980 src_page++; 2981 dst_address.address += PAGE_SIZE; 2982 cpu_va_staging_buffer += PAGE_SIZE; 2983 staging_buffer.address += PAGE_SIZE; 2984 cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2985 auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 2986 } 2987 } 2988 2989 // When the Confidential Computing feature is enabled, the function performs 2990 // GPU side page encryption. GPU operations respect the caller's membar 2991 // previously set in the push. 2992 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block, 2993 block_copy_state_t *copy_state, 2994 uvm_va_block_region_t region, 2995 uvm_push_t *push) 2996 { 2997 uvm_push_flag_t membar_flag = 0; 2998 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 2999 uvm_page_index_t page_index = region.first; 3000 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer; 3001 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 3002 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 3003 uvm_gpu_address_t src_address = block_copy_get_address(block, ©_state->src, page_index, gpu); 3004 3005 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id)); 3006 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id)); 3007 3008 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 3009 3010 staging_buffer.address += page_index * PAGE_SIZE; 3011 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE; 3012 3013 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) 3014 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE; 3015 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) 3016 membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU; 3017 3018 // Because we use kmap() for mapping pages for CPU side 3019 // crypto-operations and it only guarantees PAGE_SIZE contiguity, all 3020 // encryptions and decryptions must happen on a PAGE_SIZE basis. 3021 for_each_va_block_page_in_region(page_index, region) { 3022 uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]); 3023 3024 // First LCE operation should be non-pipelined to guarantee ordering as 3025 // we do not know when was the last non-pipelined copy. 3026 // Last one applies the membar originally planned for the push if any 3027 // TODO: 3857691: Inherit policy instead of forcing first invocation to 3028 // be non pipelined. 
        if (page_index > region.first)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);

        if (page_index < (region.outer - 1))
            uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        else if (membar_flag)
            uvm_push_set_flag(push, membar_flag);

        gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);

        src_address.address += PAGE_SIZE;
        staging_buffer.address += PAGE_SIZE;
        auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    }

    uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
}

static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
                                                  block_copy_state_t *copy_state,
                                                  uvm_push_t *push)
{
    NV_STATUS status;
    uvm_page_index_t page_index;
    uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
    uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
    void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
    void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);

    UVM_ASSERT(uvm_conf_computing_mode_enabled(push->gpu));

    if (UVM_ID_IS_GPU(copy_state->dst.id))
        return NV_OK;

    UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));

    status = uvm_push_wait(push);
    if (status != NV_OK)
        return status;

    // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
    // decryption must happen on a PAGE_SIZE basis.
    for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
        struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
        void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
        void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
        void *cpu_page_address = kmap(dst_page);

        status = uvm_conf_computing_cpu_decrypt(push->channel,
                                                cpu_page_address,
                                                staging_buffer,
                                                &dma_buffer->decrypt_iv[page_index],
                                                PAGE_SIZE,
                                                auth_tag_buffer);
        kunmap(dst_page);
        if (status != NV_OK) {
            // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
            // failures & other failures gracefully.
            // uvm_conf_computing_cpu_decrypt() can fail if the authentication
            // tag verification fails. Should this happen, it is considered a
            // critical failure and cannot be recovered from.
3090             uvm_global_set_fatal_error(status);
3091             return status;
3092         }
3093     }
3094
3095     return NV_OK;
3096 }
3097
3098 static void block_copy_push(uvm_va_block_t *block,
3099                             block_copy_state_t *copy_state,
3100                             uvm_va_block_region_t region,
3101                             uvm_push_t *push)
3102 {
3103     uvm_gpu_address_t gpu_dst_address;
3104     uvm_gpu_address_t gpu_src_address;
3105     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3106
3107     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3108
3109     if (uvm_conf_computing_mode_enabled(gpu)) {
3110         if (UVM_ID_IS_CPU(copy_state->src.id))
3111             conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3112         else
3113             conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3114
3115         return;
3116     }
3117
3118     gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3119     gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3120     gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3121 }
3122
3123 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3124                                      block_copy_state_t *copy_state,
3125                                      uvm_tracker_t *copy_tracker,
3126                                      NV_STATUS push_status,
3127                                      uvm_push_t *push)
3128 {
3129     NV_STATUS tracker_status;
3130
3131     // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3132     // by that GPU, use a GPU-local membar if no peer can currently
3133     // map this page. When peer access gets enabled, do a MEMBAR_SYS
3134     // at that point.
3135     uvm_push_end(push);
3136
3137     if ((push_status == NV_OK) && uvm_conf_computing_mode_enabled(push->gpu))
3138         push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3139
3140     tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
3141     if (push_status == NV_OK)
3142         push_status = tracker_status;
3143
3144     if (uvm_conf_computing_mode_enabled(push->gpu)) {
3145         uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3146
3147         uvm_tracker_overwrite_with_push(&local_tracker, push);
3148         uvm_conf_computing_dma_buffer_free(&push->gpu->conf_computing.dma_buffer_pool,
3149                                            copy_state->dma_buffer,
3150                                            &local_tracker);
3151         copy_state->dma_buffer = NULL;
3152         uvm_tracker_deinit(&local_tracker);
3153     }
3154
3155     return push_status;
3156 }
3157
3158 // Copies pages resident on the src_id processor to the dst_id processor
3159 //
3160 // The function adds the pages that were successfully copied to the output
3161 // migrated_pages mask and returns the number of pages in copied_pages. These
3162 // fields are reliable even if an error is returned.
3163 //
3164 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
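//
// Note: under Confidential Computing, copies whose destination is the CPU are
// staged through the push's DMA buffer (see block_copy_push() and
// conf_computing_copy_pages_finish() above), so the data is only valid on the
// CPU once the push has been waited on and the staged pages have been
// decrypted. Roughly, the sequence within this function is:
//
//     block_copy_begin_push()  - once, when the first page needing a copy is found
//     block_copy_push()        - per page, or per physically-contiguous run of pages
//     block_copy_end_push()    - once at the end; under Confidential Computing this
//                                also performs the CPU-side decryption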
3165 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block, 3166 uvm_va_block_context_t *block_context, 3167 uvm_processor_id_t dst_id, 3168 uvm_processor_id_t src_id, 3169 uvm_va_block_region_t region, 3170 uvm_page_mask_t *copy_mask, 3171 const uvm_page_mask_t *prefetch_page_mask, 3172 uvm_va_block_transfer_mode_t transfer_mode, 3173 uvm_page_mask_t *migrated_pages, 3174 NvU32 *copied_pages, 3175 uvm_tracker_t *copy_tracker) 3176 { 3177 NV_STATUS status = NV_OK; 3178 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3179 uvm_gpu_t *copying_gpu = NULL; 3180 uvm_push_t push; 3181 uvm_page_index_t page_index; 3182 uvm_page_index_t contig_start_index = region.outer; 3183 uvm_page_index_t last_index = region.outer; 3184 uvm_range_group_range_t *rgr = NULL; 3185 bool rgr_has_changed = false; 3186 uvm_make_resident_cause_t cause = block_context->make_resident.cause; 3187 uvm_make_resident_cause_t contig_cause = cause; 3188 const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3189 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3190 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask; 3191 block_copy_state_t copy_state = {0}; 3192 uvm_va_range_t *va_range = block->va_range; 3193 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 3194 3195 copy_state.src.id = src_id; 3196 copy_state.dst.id = dst_id; 3197 copy_state.src.is_block_contig = is_block_phys_contig(block, src_id); 3198 copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id); 3199 3200 *copied_pages = 0; 3201 3202 // If there are no pages to be copied, exit early 3203 if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) || 3204 !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages)) 3205 return NV_OK; 3206 3207 // uvm_range_group_range_iter_first should only be called when the va_space 3208 // lock is held, which is always the case unless an eviction is taking 3209 // place. 3210 if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) { 3211 rgr = uvm_range_group_range_iter_first(va_space, 3212 uvm_va_block_region_start(block, region), 3213 uvm_va_block_region_end(block, region)); 3214 rgr_has_changed = true; 3215 } 3216 3217 if (UVM_ID_IS_CPU(dst_id)) { 3218 uvm_memcg_context_t memcg_context; 3219 3220 // To support staging through CPU, populate CPU pages on demand. 3221 // GPU destinations should have their pages populated already, but 3222 // that might change if we add staging through GPUs. 3223 uvm_memcg_context_start(&memcg_context, block_context->mm); 3224 status = block_populate_pages_cpu(block, copy_mask, region, block_context); 3225 uvm_memcg_context_end(&memcg_context); 3226 if (status != NV_OK) 3227 return status; 3228 } 3229 3230 // TODO: Bug 3745051: This function is complicated and needs refactoring 3231 for_each_va_block_page_in_region_mask(page_index, copy_mask, region) { 3232 NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index); 3233 uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ? 3234 UVM_MAKE_RESIDENT_CAUSE_PREFETCH: 3235 cause; 3236 3237 UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); 3238 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); 3239 3240 // If we're not evicting and we're migrating away from the preferred 3241 // location, then we should add the range group range to the list of 3242 // migrated ranges in the range group. 
It's safe to skip this in the eviction case because
3243             // the use of range_group's migrated_ranges list is a UVM-Lite
3244             // optimization - eviction is not supported on UVM-Lite GPUs.
3245         if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
3246             uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
3247             // rgr_has_changed is used to minimize the number of times the
3248             // migrated_ranges_lock is taken. It is set to false when the range
3249             // group range pointed to by rgr is added to the migrated_ranges list,
3250             // and it is set back to true when we move to a different
3251             // range group range.
3252
3253             // The current page could be after the end of rgr. Iterate over the
3254             // range group ranges until rgr's end location is greater than or
3255             // equal to the current page.
3256             while (rgr && rgr->node.end < page_start) {
3257                 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
3258                 rgr_has_changed = true;
3259             }
3260
3261             // Check whether the current page lies within rgr. A single page
3262             // must entirely reside within a range group range. Since we've
3263             // incremented rgr until its end is higher than page_start, we now
3264             // check if page_start lies within rgr.
3265             if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
3266                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
3267                 if (list_empty(&rgr->range_group_migrated_list_node))
3268                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
3269                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
3270
3271                 rgr_has_changed = false;
3272             }
3273         }
3274
3275         // No need to copy pages that haven't changed. Just clear the residency
3276         // information.
3277         if (block_page_is_clean(block, dst_id, src_id, page_index))
3278             continue;
3279
3280         if (!copying_gpu) {
3281             status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
3282
3283             if (status != NV_OK)
3284                 break;
3285             copying_gpu = uvm_push_get_gpu(&push);
3286
3287             // Record all processors involved in the copy
3288             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
3289             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
3290             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
3291
3292             // uvm_tools_record_block_migration_begin() is called just once per
3293             // VA block and needs to receive the "main" cause for the migration
3294             // (it mainly checks if we are in the eviction path). Therefore, we
3295             // pass cause instead of contig_cause.
3296             uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
3297         }
3298         else {
3299             uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3300         }
3301
3302         if (!uvm_va_block_is_hmm(block))
3303             block_update_page_dirty_state(block, dst_id, src_id, page_index);
3304
3305         if (last_index == region.outer) {
3306             bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
3307             bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
3308             contig_start_index = page_index;
3309             contig_cause = page_cause;
3310
3311             // When CC is enabled, transfers between GPU and CPU don't rely on
3312             // any GPU mapping of CPU chunks, physical or virtual.
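            // Such transfers are staged through the push's DMA buffer instead,
            // so a cached physical address of the CPU chunks would go unused.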
3313             if (UVM_ID_IS_CPU(src_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3314                 can_cache_src_phys_addr = false;
3315             if (UVM_ID_IS_CPU(dst_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3316                 can_cache_dst_phys_addr = false;
3317             // Computing the physical address is a non-trivial operation and
3318             // seems to be a performance limiter on systems with 2 or more
3319             // NVLINK links. Therefore, for physically-contiguous block
3320             // storage, we cache the start address and compute the page address
3321             // using the page index.
3322             if (can_cache_src_phys_addr) {
3323                 copy_state.src.gpu_address = block_phys_page_copy_address(block,
3324                                                                           block_phys_page(src_id, 0),
3325                                                                           copying_gpu);
3326             }
3327             if (can_cache_dst_phys_addr) {
3328                 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
3329                                                                           block_phys_page(dst_id, 0),
3330                                                                           copying_gpu);
3331             }
3332         }
3333         else if ((page_index != last_index + 1) || contig_cause != page_cause) {
3334             uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3335             UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3336
3337             // If both src and dst are physically-contiguous, consolidate copies
3338             // of contiguous pages into a single method.
3339             if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3340                 block_copy_push(block, &copy_state, contig_region, &push);
3341
3342             uvm_perf_event_notify_migration(&va_space->perf_events,
3343                                             &push,
3344                                             block,
3345                                             dst_id,
3346                                             src_id,
3347                                             uvm_va_block_region_start(block, contig_region),
3348                                             uvm_va_block_region_size(contig_region),
3349                                             transfer_mode,
3350                                             contig_cause,
3351                                             &block_context->make_resident);
3352
3353             contig_start_index = page_index;
3354             contig_cause = page_cause;
3355         }
3356
3357         if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig)
3358             block_copy_push(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
3359
3360         last_index = page_index;
3361     }
3362
3363     // Copy the remaining pages
3364     if (copying_gpu) {
3365         uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3366         UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3367
3368         if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3369             block_copy_push(block, &copy_state, contig_region, &push);
3370
3371         uvm_perf_event_notify_migration(&va_space->perf_events,
3372                                         &push,
3373                                         block,
3374                                         dst_id,
3375                                         src_id,
3376                                         uvm_va_block_region_start(block, contig_region),
3377                                         uvm_va_block_region_size(contig_region),
3378                                         transfer_mode,
3379                                         contig_cause,
3380                                         &block_context->make_resident);
3381
3382         status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
3383     }
3384
3385     // Update VA block status bits
3386     //
3387     // Only update the bits for the pages that succeeded
3388     if (status != NV_OK)
3389         uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
3390
3391     *copied_pages = uvm_page_mask_weight(copy_mask);
3392     if (*copied_pages)
3393         uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
3394
3395     return status;
3396 }
3397
3398 // Copy resident pages to the destination from all source processors in the
3399 // src_processor_mask
3400 //
3401 // The function adds the pages that were successfully copied to the output
3402 // migrated_pages mask and returns the number of pages in copied_pages. These
3403 // fields are reliable even if an error is returned.
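//
// Source processors are visited in order of proximity to dst_id (see
// for_each_closest_id below), so when a page is resident in multiple
// locations the closest available copy is used as the source.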
3404 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block, 3405 uvm_va_block_context_t *block_context, 3406 uvm_processor_id_t dst_id, 3407 const uvm_processor_mask_t *src_processor_mask, 3408 uvm_va_block_region_t region, 3409 const uvm_page_mask_t *page_mask, 3410 const uvm_page_mask_t *prefetch_page_mask, 3411 uvm_va_block_transfer_mode_t transfer_mode, 3412 NvU32 max_pages_to_copy, 3413 uvm_page_mask_t *migrated_pages, 3414 NvU32 *copied_pages_out, 3415 uvm_tracker_t *tracker_out) 3416 { 3417 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 3418 uvm_processor_id_t src_id; 3419 uvm_processor_mask_t search_mask; 3420 uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask; 3421 3422 uvm_processor_mask_copy(&search_mask, src_processor_mask); 3423 3424 *copied_pages_out = 0; 3425 3426 for_each_closest_id(src_id, &search_mask, dst_id, va_space) { 3427 uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id); 3428 NV_STATUS status; 3429 NvU32 copied_pages_from_src; 3430 3431 UVM_ASSERT(!uvm_id_equal(src_id, dst_id)); 3432 3433 uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask); 3434 3435 if (page_mask) 3436 uvm_page_mask_and(copy_mask, copy_mask, page_mask); 3437 3438 status = block_copy_resident_pages_between(block, 3439 block_context, 3440 dst_id, 3441 src_id, 3442 region, 3443 copy_mask, 3444 prefetch_page_mask, 3445 transfer_mode, 3446 migrated_pages, 3447 &copied_pages_from_src, 3448 tracker_out); 3449 *copied_pages_out += copied_pages_from_src; 3450 UVM_ASSERT(*copied_pages_out <= max_pages_to_copy); 3451 3452 if (status != NV_OK) 3453 return status; 3454 3455 // Break out once we copied max pages already 3456 if (*copied_pages_out == max_pages_to_copy) 3457 break; 3458 } 3459 3460 return NV_OK; 3461 } 3462 3463 static void break_read_duplication_in_region(uvm_va_block_t *block, 3464 uvm_va_block_context_t *block_context, 3465 uvm_processor_id_t dst_id, 3466 uvm_va_block_region_t region, 3467 const uvm_page_mask_t *page_mask) 3468 { 3469 uvm_processor_id_t id; 3470 uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask; 3471 3472 uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask); 3473 3474 UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id))); 3475 3476 // Clear read_duplicated bit for all pages in region 3477 uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region); 3478 3479 // Clear residency bits for all processors other than dst_id 3480 for_each_id_in_mask(id, &block->resident) { 3481 uvm_page_mask_t *other_resident_mask; 3482 3483 if (uvm_id_equal(id, dst_id)) 3484 continue; 3485 3486 other_resident_mask = uvm_va_block_resident_mask_get(block, id); 3487 3488 if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region)) 3489 block_clear_resident_processor(block, id); 3490 } 3491 } 3492 3493 static void block_copy_set_first_touch_residency(uvm_va_block_t *block, 3494 uvm_va_block_context_t *block_context, 3495 uvm_processor_id_t dst_id, 3496 uvm_va_block_region_t region, 3497 const uvm_page_mask_t *page_mask) 3498 { 3499 uvm_page_index_t page_index; 3500 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3501 uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask; 3502 3503 if (page_mask) 3504 uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask); 3505 
else 3506 uvm_page_mask_complement(first_touch_mask, resident_mask); 3507 3508 uvm_page_mask_region_clear_outside(first_touch_mask, region); 3509 3510 for_each_va_block_page_in_mask(page_index, first_touch_mask, block) { 3511 UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index)); 3512 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); 3513 UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); 3514 } 3515 3516 uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask); 3517 if (!uvm_page_mask_empty(resident_mask)) 3518 block_set_resident_processor(block, dst_id); 3519 3520 // Add them to the output mask, too 3521 uvm_page_mask_or(&block_context->make_resident.pages_changed_residency, 3522 &block_context->make_resident.pages_changed_residency, 3523 first_touch_mask); 3524 } 3525 3526 // Copy resident pages from other processors to the destination. 3527 // All the pages on the destination need to be populated by the caller first. 3528 // Pages not resident anywhere else need to be zeroed out as well. 3529 // The transfer_mode is only used to tell uvm_perf_event_notify_migration() 3530 // whether the copy is for a migration or read duplication. 3531 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block, 3532 uvm_va_block_context_t *block_context, 3533 uvm_processor_id_t dst_id, 3534 uvm_va_block_region_t region, 3535 const uvm_page_mask_t *page_mask, 3536 const uvm_page_mask_t *prefetch_page_mask, 3537 uvm_va_block_transfer_mode_t transfer_mode) 3538 { 3539 NV_STATUS status = NV_OK; 3540 NV_STATUS tracker_status; 3541 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 3542 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); 3543 NvU32 missing_pages_count; 3544 NvU32 pages_copied; 3545 NvU32 pages_copied_to_cpu; 3546 uvm_processor_mask_t src_processor_mask; 3547 uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask; 3548 uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated; 3549 uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged; 3550 3551 uvm_page_mask_zero(migrated_pages); 3552 uvm_page_mask_zero(staged_pages); 3553 3554 if (page_mask) 3555 uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask); 3556 else 3557 uvm_page_mask_complement(copy_page_mask, resident_mask); 3558 3559 missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region); 3560 3561 if (missing_pages_count == 0) 3562 goto out; 3563 3564 // TODO: Bug 1753731: Add P2P2P copies staged through a GPU 3565 // TODO: Bug 1753731: When a page is resident in multiple locations due to 3566 // read-duplication, spread out the source of the copy so we don't 3567 // bottleneck on a single location. 3568 3569 uvm_processor_mask_zero(&src_processor_mask); 3570 3571 if (!uvm_id_equal(dst_id, UVM_ID_CPU)) { 3572 // If the destination is a GPU, first copy everything from processors 3573 // with copy access supported. Notably this will copy pages from the CPU 3574 // as well even if later some extra copies from CPU are required for 3575 // staged copies. 
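        // Only processors that dst_id can copy from directly and that
        // currently have resident pages are considered as sources here.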
3576 uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident); 3577 uvm_processor_mask_clear(&src_processor_mask, dst_id); 3578 3579 status = block_copy_resident_pages_mask(block, 3580 block_context, 3581 dst_id, 3582 &src_processor_mask, 3583 region, 3584 copy_page_mask, 3585 prefetch_page_mask, 3586 transfer_mode, 3587 missing_pages_count, 3588 migrated_pages, 3589 &pages_copied, 3590 &local_tracker); 3591 3592 UVM_ASSERT(missing_pages_count >= pages_copied); 3593 missing_pages_count -= pages_copied; 3594 3595 if (status != NV_OK) 3596 goto out; 3597 3598 if (missing_pages_count == 0) 3599 goto out; 3600 3601 if (pages_copied) 3602 uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages); 3603 } 3604 3605 // Now copy from everywhere else to the CPU. This is both for when the 3606 // destination is the CPU (src_processor_mask empty) and for a staged copy 3607 // (src_processor_mask containing processors with copy access to dst_id). 3608 uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask); 3609 uvm_processor_mask_clear(&src_processor_mask, dst_id); 3610 uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU); 3611 3612 status = block_copy_resident_pages_mask(block, 3613 block_context, 3614 UVM_ID_CPU, 3615 &src_processor_mask, 3616 region, 3617 copy_page_mask, 3618 prefetch_page_mask, 3619 transfer_mode, 3620 missing_pages_count, 3621 staged_pages, 3622 &pages_copied_to_cpu, 3623 &local_tracker); 3624 if (status != NV_OK) 3625 goto out; 3626 3627 // If destination is the CPU then we copied everything there above 3628 if (UVM_ID_IS_CPU(dst_id)) { 3629 uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages); 3630 missing_pages_count -= pages_copied_to_cpu; 3631 3632 goto out; 3633 } 3634 3635 // Add everything to the block's tracker so that the 3636 // block_copy_resident_pages_between() call below will acquire it. 3637 status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); 3638 if (status != NV_OK) 3639 goto out; 3640 uvm_tracker_clear(&local_tracker); 3641 3642 // Now copy staged pages from the CPU to the destination. 3643 status = block_copy_resident_pages_between(block, 3644 block_context, 3645 dst_id, 3646 UVM_ID_CPU, 3647 region, 3648 staged_pages, 3649 prefetch_page_mask, 3650 transfer_mode, 3651 migrated_pages, 3652 &pages_copied, 3653 &local_tracker); 3654 3655 UVM_ASSERT(missing_pages_count >= pages_copied); 3656 missing_pages_count -= pages_copied; 3657 3658 if (status != NV_OK) 3659 goto out; 3660 3661 // If we get here, that means we were staging the copy through the CPU and 3662 // we should copy as many pages from the CPU as we copied to the CPU. 3663 UVM_ASSERT(pages_copied == pages_copied_to_cpu); 3664 3665 out: 3666 // Add everything from the local tracker to the block's tracker. 3667 // Notably this is also needed for handling 3668 // block_copy_resident_pages_between() failures in the first loop. 3669 tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); 3670 uvm_tracker_deinit(&local_tracker); 3671 3672 return status == NV_OK ? 
tracker_status : status; 3673 } 3674 3675 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block, 3676 uvm_va_block_retry_t *va_block_retry, 3677 uvm_va_block_context_t *va_block_context, 3678 uvm_processor_id_t dest_id, 3679 uvm_va_block_region_t region, 3680 const uvm_page_mask_t *page_mask, 3681 const uvm_page_mask_t *prefetch_page_mask, 3682 uvm_make_resident_cause_t cause) 3683 { 3684 NV_STATUS status; 3685 uvm_processor_mask_t unmap_processor_mask; 3686 uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask; 3687 uvm_page_mask_t *resident_mask; 3688 3689 va_block_context->make_resident.dest_id = dest_id; 3690 va_block_context->make_resident.cause = cause; 3691 3692 if (prefetch_page_mask) { 3693 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3694 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3695 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); 3696 } 3697 3698 uvm_assert_mutex_locked(&va_block->lock); 3699 UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 3700 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 3701 3702 resident_mask = block_resident_mask_get_alloc(va_block, dest_id); 3703 if (!resident_mask) 3704 return NV_ERR_NO_MEMORY; 3705 3706 // Unmap all mapped processors except for UVM-Lite GPUs as their mappings 3707 // are largely persistent. 3708 uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); 3709 3710 if (page_mask) 3711 uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask); 3712 else 3713 uvm_page_mask_complement(unmap_page_mask, resident_mask); 3714 uvm_page_mask_region_clear_outside(unmap_page_mask, region); 3715 3716 // Unmap all pages not resident on the destination 3717 status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); 3718 if (status != NV_OK) 3719 return status; 3720 3721 if (page_mask) 3722 uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages); 3723 else 3724 uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages); 3725 uvm_page_mask_region_clear_outside(unmap_page_mask, region); 3726 3727 // Also unmap read-duplicated pages excluding dest_id 3728 uvm_processor_mask_clear(&unmap_processor_mask, dest_id); 3729 status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); 3730 if (status != NV_OK) 3731 return status; 3732 3733 uvm_tools_record_read_duplicate_invalidate(va_block, 3734 dest_id, 3735 region, 3736 unmap_page_mask); 3737 3738 // Note that block_populate_pages and block_copy_resident_pages also use 3739 // va_block_context->make_resident.page_mask. 
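    // Drop the local alias so the mask is not accidentally reused once those
    // functions start using it for their own purposes.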
3740 unmap_page_mask = NULL; 3741 3742 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); 3743 if (status != NV_OK) 3744 return status; 3745 3746 return block_copy_resident_pages(va_block, 3747 va_block_context, 3748 dest_id, 3749 region, 3750 page_mask, 3751 prefetch_page_mask, 3752 UVM_VA_BLOCK_TRANSFER_MODE_MOVE); 3753 } 3754 3755 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block, 3756 uvm_processor_id_t dst_id, 3757 uvm_page_mask_t *page_mask) 3758 { 3759 uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id); 3760 3761 UVM_ASSERT(dst_gpu_state); 3762 3763 if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask)) 3764 uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id); 3765 } 3766 3767 static void block_make_resident_update_state(uvm_va_block_t *va_block, 3768 uvm_va_block_context_t *va_block_context, 3769 uvm_processor_id_t dst_id, 3770 uvm_va_block_region_t region, 3771 uvm_page_mask_t *copy_mask, 3772 uvm_make_resident_cause_t cause) 3773 { 3774 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id); 3775 3776 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask); 3777 block_set_resident_processor(va_block, dst_id); 3778 3779 // Accumulate the pages that migrated into the output mask. 3780 uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency, 3781 &va_block_context->make_resident.pages_changed_residency, 3782 copy_mask); 3783 3784 // Any move operation implies that mappings have been removed from all 3785 // non-UVM-Lite GPUs. 3786 uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask); 3787 3788 // If we are migrating due to an eviction, set the GPU as evicted and 3789 // mark the evicted pages. If we are migrating away from the CPU this 3790 // means that those pages are not evicted. 3791 if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) { 3792 uvm_processor_id_t src_id; 3793 3794 UVM_ASSERT(UVM_ID_IS_CPU(dst_id)); 3795 3796 // Note that the destination is the CPU so this loop excludes it. 3797 for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) { 3798 uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id); 3799 3800 UVM_ASSERT(src_gpu_state); 3801 3802 uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask); 3803 uvm_processor_mask_set(&va_block->evicted_gpus, src_id); 3804 } 3805 } 3806 else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id)) 3807 block_make_resident_clear_evicted(va_block, dst_id, copy_mask); 3808 } 3809 3810 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block, 3811 uvm_va_block_context_t *va_block_context, 3812 uvm_va_block_region_t region, 3813 const uvm_page_mask_t *page_mask) 3814 { 3815 uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated; 3816 uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id; 3817 3818 uvm_assert_mutex_locked(&va_block->lock); 3819 3820 if (page_mask) 3821 uvm_page_mask_and(migrated_pages, migrated_pages, page_mask); 3822 3823 if (!uvm_page_mask_empty(migrated_pages)) { 3824 // The migrated pages are now resident on the destination. 
3825 block_make_resident_update_state(va_block, 3826 va_block_context, 3827 dst_id, 3828 region, 3829 migrated_pages, 3830 va_block_context->make_resident.cause); 3831 } 3832 3833 // Pages that weren't resident anywhere else were populated at the 3834 // destination directly. Mark them as resident now. 3835 block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask); 3836 3837 // Break read duplication and clear residency from other processors. 3838 break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask); 3839 3840 // Update eviction heuristics, if needed. Notably this could repeat the call 3841 // done in block_set_resident_processor(), but that doesn't do anything bad 3842 // and it's simpler to keep it in both places. 3843 // 3844 // Skip this if we didn't do anything (the input region and/or page mask was 3845 // empty). 3846 if (uvm_processor_mask_test(&va_block->resident, dst_id)) 3847 block_mark_memory_used(va_block, dst_id); 3848 } 3849 3850 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block, 3851 uvm_va_block_retry_t *va_block_retry, 3852 uvm_va_block_context_t *va_block_context, 3853 uvm_processor_id_t dest_id, 3854 uvm_va_block_region_t region, 3855 const uvm_page_mask_t *page_mask, 3856 const uvm_page_mask_t *prefetch_page_mask, 3857 uvm_make_resident_cause_t cause) 3858 { 3859 NV_STATUS status; 3860 3861 status = uvm_va_block_make_resident_copy(va_block, 3862 va_block_retry, 3863 va_block_context, 3864 dest_id, 3865 region, 3866 page_mask, 3867 prefetch_page_mask, 3868 cause); 3869 if (status != NV_OK) 3870 return status; 3871 3872 uvm_va_block_make_resident_finish(va_block, 3873 va_block_context, 3874 region, 3875 page_mask); 3876 3877 return NV_OK; 3878 } 3879 3880 // Combination function which prepares the input {region, page_mask} for 3881 // entering read-duplication. It: 3882 // - Unmaps all processors but revoke_id 3883 // - Revokes write access from revoke_id 3884 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block, 3885 uvm_va_block_context_t *va_block_context, 3886 uvm_processor_id_t revoke_id, 3887 uvm_va_block_region_t region, 3888 const uvm_page_mask_t *page_mask) 3889 { 3890 uvm_processor_mask_t unmap_processor_mask; 3891 uvm_processor_id_t unmap_id; 3892 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 3893 NV_STATUS status, tracker_status; 3894 3895 // Unmap everybody except revoke_id 3896 uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); 3897 uvm_processor_mask_clear(&unmap_processor_mask, revoke_id); 3898 3899 for_each_id_in_mask(unmap_id, &unmap_processor_mask) { 3900 status = uvm_va_block_unmap(va_block, 3901 va_block_context, 3902 unmap_id, 3903 region, 3904 page_mask, 3905 &local_tracker); 3906 if (status != NV_OK) 3907 goto out; 3908 } 3909 3910 // Revoke WRITE/ATOMIC access permissions from the remaining mapped 3911 // processor. 3912 status = uvm_va_block_revoke_prot(va_block, 3913 va_block_context, 3914 revoke_id, 3915 region, 3916 page_mask, 3917 UVM_PROT_READ_WRITE, 3918 &local_tracker); 3919 if (status != NV_OK) 3920 goto out; 3921 3922 out: 3923 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 3924 uvm_tracker_deinit(&local_tracker); 3925 return status == NV_OK ? 
tracker_status : status; 3926 } 3927 3928 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block, 3929 uvm_va_block_retry_t *va_block_retry, 3930 uvm_va_block_context_t *va_block_context, 3931 uvm_processor_id_t dest_id, 3932 uvm_va_block_region_t region, 3933 const uvm_page_mask_t *page_mask, 3934 const uvm_page_mask_t *prefetch_page_mask, 3935 uvm_make_resident_cause_t cause) 3936 { 3937 NV_STATUS status = NV_OK; 3938 uvm_processor_id_t src_id; 3939 uvm_page_mask_t *dst_resident_mask; 3940 uvm_page_mask_t *cpu_resident_mask; 3941 uvm_page_mask_t *migrated_pages; 3942 uvm_page_mask_t *staged_pages; 3943 uvm_page_mask_t *first_touch_mask; 3944 3945 // TODO: Bug 3660922: need to implement HMM read duplication support. 3946 UVM_ASSERT(!uvm_va_block_is_hmm(va_block)); 3947 UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range)); 3948 3949 va_block_context->make_resident.dest_id = dest_id; 3950 va_block_context->make_resident.cause = cause; 3951 3952 if (prefetch_page_mask) { 3953 // TODO: Bug 1877578: investigate automatic read-duplicate policies 3954 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || 3955 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || 3956 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); 3957 } 3958 3959 uvm_assert_mutex_locked(&va_block->lock); 3960 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 3961 3962 // For pages that are entering read-duplication we need to unmap remote 3963 // mappings and revoke RW and higher access permissions. 3964 // 3965 // The current implementation: 3966 // - Unmaps pages from all processors but the one with the resident copy 3967 // - Revokes write access from the processor with the resident copy 3968 for_each_id_in_mask(src_id, &va_block->resident) { 3969 // Note that the below calls to block_populate_pages and 3970 // block_copy_resident_pages also use 3971 // va_block_context->make_resident.page_mask. 3972 uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask; 3973 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id); 3974 UVM_ASSERT(!uvm_page_mask_empty(resident_mask)); 3975 3976 if (page_mask) 3977 uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages); 3978 else 3979 uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages); 3980 3981 // If there are no pages that need to be unmapped/revoked, skip to the 3982 // next processor 3983 if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask)) 3984 continue; 3985 3986 status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask); 3987 if (status != NV_OK) 3988 return status; 3989 } 3990 3991 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); 3992 if (status != NV_OK) 3993 return status; 3994 3995 status = block_copy_resident_pages(va_block, 3996 va_block_context, 3997 dest_id, 3998 region, 3999 page_mask, 4000 prefetch_page_mask, 4001 UVM_VA_BLOCK_TRANSFER_MODE_COPY); 4002 if (status != NV_OK) 4003 return status; 4004 4005 // Pages that weren't resident anywhere else were populated at the 4006 // destination directly. Mark them as resident now, since there were no 4007 // errors from block_copy_resident_pages() above. 
4008 // Note that va_block_context->scratch_page_mask is passed to 4009 // block_copy_set_first_touch_residency() which is generally unsafe but in 4010 // this case, block_copy_set_first_touch_residency() copies page_mask 4011 // before scratch_page_mask could be clobbered. 4012 migrated_pages = &va_block_context->make_resident.pages_migrated; 4013 first_touch_mask = &va_block_context->scratch_page_mask; 4014 uvm_page_mask_init_from_region(first_touch_mask, region, page_mask); 4015 uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages); 4016 4017 if (!uvm_page_mask_empty(first_touch_mask)) 4018 block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask); 4019 4020 staged_pages = &va_block_context->make_resident.pages_staged; 4021 if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) { 4022 cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU); 4023 uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages); 4024 block_set_resident_processor(va_block, UVM_ID_CPU); 4025 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages); 4026 uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages); 4027 } 4028 4029 if (!uvm_page_mask_empty(migrated_pages)) { 4030 dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id); 4031 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages); 4032 block_set_resident_processor(va_block, dest_id); 4033 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages); 4034 uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages); 4035 } 4036 4037 UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION); 4038 if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id)) 4039 block_make_resident_clear_evicted(va_block, dest_id, migrated_pages); 4040 4041 // Update eviction heuristics, if needed. Notably this could repeat the call 4042 // done in block_set_resident_processor(), but that doesn't do anything bad 4043 // and it's simpler to keep it in both places. 4044 // 4045 // Skip this if we didn't do anything (the input region and/or page mask was 4046 // empty). 4047 if (uvm_processor_mask_test(&va_block->resident, dest_id)) 4048 block_mark_memory_used(va_block, dest_id); 4049 4050 return NV_OK; 4051 } 4052 4053 // Looks up the current CPU mapping state of page from the 4054 // block->cpu.pte_bits bitmaps. If write access is enabled, 4055 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since 4056 // write access implies atomic access for CPUs. 4057 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index) 4058 { 4059 uvm_prot_t prot; 4060 4061 UVM_ASSERT(!uvm_va_block_is_dead(block)); 4062 4063 if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index)) 4064 prot = UVM_PROT_READ_WRITE_ATOMIC; 4065 else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) 4066 prot = UVM_PROT_READ_ONLY; 4067 else 4068 prot = UVM_PROT_NONE; 4069 4070 return prot; 4071 } 4072 4073 // Looks up the current GPU mapping state of page from the 4074 // block->gpus[i]->pte_bits bitmaps. 
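// Returns UVM_PROT_NONE if no GPU state has been allocated for this block.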
4075 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index) 4076 { 4077 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4078 uvm_prot_t prot; 4079 4080 UVM_ASSERT(!uvm_va_block_is_dead(block)); 4081 4082 if (!gpu_state) 4083 return UVM_PROT_NONE; 4084 4085 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index)) 4086 prot = UVM_PROT_READ_WRITE_ATOMIC; 4087 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index)) 4088 prot = UVM_PROT_READ_WRITE; 4089 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) 4090 prot = UVM_PROT_READ_ONLY; 4091 else 4092 prot = UVM_PROT_NONE; 4093 4094 return prot; 4095 } 4096 4097 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index) 4098 { 4099 if (UVM_ID_IS_CPU(id)) 4100 return block_page_prot_cpu(block, page_index); 4101 else 4102 return block_page_prot_gpu(block, block_get_gpu(block, id), page_index); 4103 } 4104 4105 // Returns true if the block has any valid CPU PTE mapping in the block region. 4106 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region) 4107 { 4108 size_t valid_page; 4109 4110 UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block)); 4111 4112 // Early-out: check whether any address in this block has a CPU mapping 4113 if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) { 4114 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])); 4115 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 4116 return false; 4117 } 4118 4119 // All valid mappings have at least read permissions so we only need to 4120 // inspect the read bits. 
4121 valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]); 4122 if (valid_page == region.outer) 4123 return false; 4124 4125 UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE); 4126 return true; 4127 } 4128 4129 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 4130 { 4131 uvm_gpu_t *accessing_gpu; 4132 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4133 4134 if (!uvm_pmm_sysmem_mappings_indirect_supported()) 4135 return true; 4136 4137 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 4138 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 4139 uvm_reverse_map_t reverse_map; 4140 size_t num_mappings; 4141 4142 num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings, 4143 peer_addr, 4144 uvm_gpu_chunk_get_size(chunk), 4145 &reverse_map, 4146 1); 4147 UVM_ASSERT(num_mappings == 1); 4148 UVM_ASSERT(reverse_map.va_block == block); 4149 UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index); 4150 UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk)); 4151 4152 uvm_va_block_release_no_destroy(reverse_map.va_block); 4153 } 4154 4155 return true; 4156 } 4157 4158 // Sanity check the given GPU's chunks array 4159 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id) 4160 { 4161 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 4162 uvm_gpu_t *gpu; 4163 size_t i, num_chunks; 4164 uvm_page_index_t page_index; 4165 uvm_chunk_size_t chunk_size; 4166 4167 if (!gpu_state) 4168 return true; 4169 4170 gpu = block_get_gpu(block, id); 4171 4172 num_chunks = block_num_gpu_chunks(block, gpu); 4173 for (page_index = 0, i = 0; i < num_chunks; i++) { 4174 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 4175 size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); 4176 4177 if (chunk_index != i) { 4178 UVM_ERR_PRINT("chunk index mismatch: calculated %zu, is in %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n", 4179 chunk_index, 4180 i, 4181 block->start, 4182 block->end + 1, 4183 uvm_id_value(id), 4184 page_index); 4185 return false; 4186 } 4187 4188 if (chunk) { 4189 if (chunk_size != uvm_gpu_chunk_get_size(chunk)) { 4190 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n", 4191 chunk_size, 4192 uvm_gpu_chunk_get_size(chunk), 4193 block->start, 4194 block->end + 1, 4195 uvm_id_value(id), 4196 page_index, 4197 i); 4198 return false; 4199 } 4200 4201 if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { 4202 UVM_ERR_PRINT("Invalid chunk state %s. 
VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n", 4203 uvm_pmm_gpu_chunk_state_string(chunk->state), 4204 block->start, 4205 block->end + 1, 4206 uvm_id_value(id), 4207 page_index, 4208 i, 4209 chunk_size); 4210 return false; 4211 } 4212 4213 UVM_ASSERT(chunk->va_block == block); 4214 UVM_ASSERT(chunk->va_block_page_index == page_index); 4215 4216 UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk)); 4217 } 4218 4219 page_index += chunk_size / PAGE_SIZE; 4220 } 4221 4222 return true; 4223 } 4224 4225 static bool block_check_chunks(uvm_va_block_t *va_block) 4226 { 4227 uvm_gpu_id_t id; 4228 4229 for_each_gpu_id(id) { 4230 if (!block_check_gpu_chunks(va_block, id)) 4231 return false; 4232 } 4233 4234 return block_check_cpu_chunks(va_block); 4235 } 4236 4237 // Sanity checks for page mappings 4238 static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index) 4239 { 4240 uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings; 4241 uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings; 4242 uvm_processor_mask_t remaining_mappings, temp_mappings; 4243 uvm_processor_mask_t resident_processors; 4244 const uvm_processor_mask_t *residency_accessible_from = NULL; 4245 const uvm_processor_mask_t *residency_has_native_atomics = NULL; 4246 uvm_processor_id_t residency, id; 4247 uvm_va_range_t *va_range = block->va_range; 4248 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4249 uvm_processor_id_t preferred_location = va_range ? 4250 uvm_va_range_get_policy(va_range)->preferred_location : 4251 UVM_ID_INVALID; 4252 const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block); 4253 4254 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); 4255 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings); 4256 block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings); 4257 4258 // Each access bit implies all accesses below it 4259 UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings)); 4260 UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings)); 4261 UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped)); 4262 4263 uvm_va_block_page_resident_processors(block, page_index, &resident_processors); 4264 UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident)); 4265 4266 // Sanity check block_get_mapped_processors 4267 uvm_processor_mask_copy(&remaining_mappings, &read_mappings); 4268 for_each_id_in_mask(residency, &resident_processors) { 4269 block_get_mapped_processors(block, residency, page_index, &temp_mappings); 4270 UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings)); 4271 uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings); 4272 } 4273 4274 // Any remaining mappings point to non-resident locations, so they must be 4275 // UVM-Lite mappings. 
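    // (UVM-Lite GPUs keep the preferred location mapped even when the data is
    // resident elsewhere; the UVM-Lite specific checks follow below.)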
4276 UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus)); 4277 4278 residency = uvm_processor_mask_find_first_id(&resident_processors); 4279 4280 if (uvm_processor_mask_get_count(&resident_processors) > 0) { 4281 residency_accessible_from = &va_space->accessible_from[uvm_id_value(residency)]; 4282 residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)]; 4283 } 4284 4285 // If the page is not resident, there should be no valid mappings 4286 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 || 4287 uvm_processor_mask_get_count(&read_mappings) == 0, 4288 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4289 *resident_processors.bitmap, 4290 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4291 *va_space->system_wide_atomics_enabled_processors.bitmap, 4292 *block->read_duplicated_pages.bitmap); 4293 4294 // Test read_duplicated_pages mask 4295 UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 && 4296 !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) || 4297 (uvm_processor_mask_get_count(&resident_processors) > 1 && 4298 uvm_page_mask_test(&block->read_duplicated_pages, page_index)), 4299 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4300 *resident_processors.bitmap, 4301 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4302 *va_space->system_wide_atomics_enabled_processors.bitmap, 4303 *block->read_duplicated_pages.bitmap); 4304 4305 if (!uvm_processor_mask_empty(uvm_lite_gpus)) 4306 UVM_ASSERT(UVM_ID_IS_VALID(preferred_location)); 4307 4308 // UVM-Lite checks. Since the range group is made non-migratable before the 4309 // actual migrations for that range group happen, we can only make those 4310 // checks which are valid on both migratable and non-migratable range 4311 // groups. 4312 uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus); 4313 uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus); 4314 4315 // Any mapping from a UVM-Lite GPU must be atomic... 4316 UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings)); 4317 4318 // ... 
and must have access to preferred_location 4319 if (UVM_ID_IS_VALID(preferred_location)) { 4320 const uvm_processor_mask_t *preferred_location_accessible_from; 4321 4322 preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)]; 4323 UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from)); 4324 } 4325 4326 for_each_id_in_mask(id, &lite_atomic_mappings) 4327 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location)); 4328 4329 // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests 4330 uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus); 4331 uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus); 4332 uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus); 4333 4334 // Pages set to zero in maybe_mapped_pages must not be mapped on any 4335 // non-UVM-Lite GPU 4336 if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) { 4337 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0, 4338 "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n", 4339 *resident_processors.bitmap, 4340 *block->mapped.bitmap, 4341 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap); 4342 } 4343 4344 // atomic mappings from GPUs with disabled system-wide atomics are treated 4345 // as write mappings. Therefore, we remove them from the atomic mappings mask 4346 uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors); 4347 4348 if (!uvm_processor_mask_empty(&read_mappings)) { 4349 // Read-duplicate: if a page is resident in multiple locations, it 4350 // must be resident locally on each mapped processor. 
4351 if (uvm_processor_mask_get_count(&resident_processors) > 1) { 4352 UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors), 4353 "Read-duplicate copies from remote processors\n" 4354 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4355 *resident_processors.bitmap, 4356 *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, 4357 *va_space->system_wide_atomics_enabled_processors.bitmap, 4358 *block->read_duplicated_pages.bitmap); 4359 } 4360 else { 4361 // Processors with mappings must have access to the processor that 4362 // has the valid copy 4363 UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from), 4364 "Not all processors have access to %s\n" 4365 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4366 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4367 uvm_va_space_processor_name(va_space, residency), 4368 *resident_processors.bitmap, 4369 *read_mappings.bitmap, 4370 *write_mappings.bitmap, 4371 *atomic_mappings.bitmap, 4372 *residency_accessible_from->bitmap, 4373 *residency_has_native_atomics->bitmap, 4374 *va_space->system_wide_atomics_enabled_processors.bitmap); 4375 for_each_id_in_mask(id, &read_mappings) { 4376 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency)); 4377 4378 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) { 4379 uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency); 4380 uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id); 4381 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL); 4382 4383 // This function will assert if no mapping exists 4384 (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu); 4385 } 4386 } 4387 } 4388 } 4389 4390 // If any processor has a writable mapping, there must only be one copy of 4391 // the page in the system 4392 if (!uvm_processor_mask_empty(&write_mappings)) { 4393 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1, 4394 "Too many resident copies for pages with write_mappings\n" 4395 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", 4396 *resident_processors.bitmap, 4397 *read_mappings.bitmap, 4398 *write_mappings.bitmap, 4399 *atomic_mappings.bitmap, 4400 *va_space->system_wide_atomics_enabled_processors.bitmap, 4401 *block->read_duplicated_pages.bitmap); 4402 } 4403 4404 if (!uvm_processor_mask_empty(&atomic_mappings)) { 4405 uvm_processor_mask_t native_atomics; 4406 4407 uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics); 4408 4409 if (uvm_processor_mask_empty(&native_atomics)) { 4410 // No other faultable processor should be able to write 4411 uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors); 4412 4413 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1, 4414 "Too many write mappings to %s from processors with non-native atomics\n" 4415 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4416 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4417 uvm_va_space_processor_name(va_space, residency), 4418 *resident_processors.bitmap, 4419 *read_mappings.bitmap, 4420 *write_mappings.bitmap, 4421 *atomic_mappings.bitmap, 4422 *residency_accessible_from->bitmap, 4423 *residency_has_native_atomics->bitmap, 4424 *va_space->system_wide_atomics_enabled_processors.bitmap); 4425 4426 // Only one 
processor outside of the native group can have atomics enabled 4427 UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1, 4428 "Too many atomics mappings to %s from processors with non-native atomics\n" 4429 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4430 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4431 uvm_va_space_processor_name(va_space, residency), 4432 *resident_processors.bitmap, 4433 *read_mappings.bitmap, 4434 *write_mappings.bitmap, 4435 *atomic_mappings.bitmap, 4436 *residency_accessible_from->bitmap, 4437 *residency_has_native_atomics->bitmap, 4438 *va_space->system_wide_atomics_enabled_processors.bitmap); 4439 } 4440 else { 4441 uvm_processor_mask_t non_native_atomics; 4442 4443 // One or more processors within the native group have atomics enabled. 4444 // All processors outside of that group may have write but not atomic 4445 // permissions. 4446 uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics); 4447 4448 UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics), 4449 "atomic mappings to %s from processors native and non-native\n" 4450 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" 4451 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", 4452 uvm_va_space_processor_name(va_space, residency), 4453 *resident_processors.bitmap, 4454 *read_mappings.bitmap, 4455 *write_mappings.bitmap, 4456 *atomic_mappings.bitmap, 4457 *residency_accessible_from->bitmap, 4458 *residency_has_native_atomics->bitmap, 4459 *va_space->system_wide_atomics_enabled_processors.bitmap); 4460 } 4461 } 4462 4463 return true; 4464 } 4465 4466 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu) 4467 { 4468 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4469 uvm_va_block_gpu_state_t *resident_gpu_state; 4470 uvm_pte_bits_gpu_t pte_bit; 4471 uvm_processor_id_t resident_id; 4472 uvm_prot_t prot; 4473 NvU32 big_page_size; 4474 size_t num_big_pages, big_page_index; 4475 uvm_va_block_region_t big_region, chunk_region; 4476 uvm_gpu_chunk_t *chunk; 4477 4478 if (!gpu_state->page_table_range_4k.table) 4479 UVM_ASSERT(!gpu_state->activated_4k); 4480 4481 if (!gpu_state->page_table_range_big.table) { 4482 UVM_ASSERT(!gpu_state->initialized_big); 4483 UVM_ASSERT(!gpu_state->activated_big); 4484 } 4485 4486 // It's only safe to check the PTE mappings if we have page tables. See 4487 // uvm_va_block_get_gpu_va_space. 4488 if (!block_gpu_has_page_tables(block, gpu)) { 4489 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id)); 4490 return true; 4491 } 4492 4493 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 4494 num_big_pages = uvm_va_block_num_big_pages(block, big_page_size); 4495 4496 if (block_gpu_supports_2m(block, gpu)) { 4497 if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) { 4498 // 2M blocks require the 2M entry to be allocated for the lower 4499 // ranges to also be allocated. 4500 UVM_ASSERT(gpu_state->page_table_range_2m.table); 4501 } 4502 else if (gpu_state->page_table_range_2m.table) { 4503 // If the 2M entry is present but the lower ones aren't, the PTE 4504 // must be 2M. 
4505 UVM_ASSERT(gpu_state->pte_is_2m); 4506 } 4507 } 4508 else { 4509 UVM_ASSERT(!gpu_state->page_table_range_2m.table); 4510 if (num_big_pages == 0) 4511 UVM_ASSERT(!gpu_state->page_table_range_big.table); 4512 } 4513 4514 // If we have the big table and it's in use then it must have been 4515 // initialized, even if it doesn't currently contain active PTEs. 4516 if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) || 4517 (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big)) 4518 UVM_ASSERT(gpu_state->initialized_big); 4519 4520 if (gpu_state->pte_is_2m) { 4521 UVM_ASSERT(block_gpu_supports_2m(block, gpu)); 4522 UVM_ASSERT(gpu_state->page_table_range_2m.table); 4523 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 4524 UVM_ASSERT(!gpu_state->force_4k_ptes); 4525 4526 // GPU architectures which support 2M pages only support 64K as the big 4527 // page size. All of the 2M code assumes that 4528 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full, 4529 // bitmap_complement, etc). 4530 BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4531 4532 prot = block_page_prot_gpu(block, gpu, 0); 4533 4534 // All page permissions match 4535 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 4536 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) 4537 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit])); 4538 else 4539 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit])); 4540 } 4541 4542 if (prot != UVM_PROT_NONE) { 4543 resident_id = block_gpu_get_processor_to_map(block, gpu, 0); 4544 4545 // block_check_resident_proximity verifies that no closer processor 4546 // has a resident page, so we don't need to check that all pages 4547 // have the same resident_id. 4548 4549 // block_check_mappings_page verifies that all pages marked resident 4550 // are backed by populated memory. 4551 4552 // The mapped processor should be fully resident and physically- 4553 // contiguous. 4554 UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id))); 4555 4556 if (UVM_ID_IS_GPU(resident_id)) { 4557 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id); 4558 UVM_ASSERT(resident_gpu_state); 4559 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M); 4560 } 4561 else { 4562 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block, 4563 uvm_va_block_region_from_block(block), 4564 NULL); 4565 4566 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated)); 4567 UVM_ASSERT(chunk); 4568 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 4569 } 4570 } 4571 } 4572 else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 4573 UVM_ASSERT(gpu_state->page_table_range_big.table); 4574 UVM_ASSERT(!gpu_state->force_4k_ptes); 4575 UVM_ASSERT(num_big_pages > 0); 4576 UVM_ASSERT(gpu_state->initialized_big); 4577 4578 for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) { 4579 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 4580 4581 if (!test_bit(big_page_index, gpu_state->big_ptes)) { 4582 // If there are valid mappings but this isn't a big PTE, the 4583 // mapping must be using the 4k PTEs. 
4584 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region)) 4585 UVM_ASSERT(gpu_state->page_table_range_4k.table); 4586 continue; 4587 } 4588 4589 prot = block_page_prot_gpu(block, gpu, big_region.first); 4590 4591 // All page permissions match 4592 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 4593 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) 4594 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region)); 4595 else 4596 UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region)); 4597 } 4598 4599 if (prot != UVM_PROT_NONE) { 4600 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 4601 4602 // The mapped processor should be fully resident and physically- 4603 // contiguous. Exception: UVM-Lite GPUs always map the preferred 4604 // location even if the memory is resident elsewhere. Skip the 4605 // residency check but still verify contiguity. 4606 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { 4607 UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id), 4608 big_region)); 4609 } 4610 4611 if (UVM_ID_IS_CPU(resident_id)) { 4612 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first); 4613 4614 UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages); 4615 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region)); 4616 } 4617 else { 4618 // Check GPU chunks 4619 chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL); 4620 chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first); 4621 UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region)); 4622 } 4623 } 4624 } 4625 } 4626 4627 return true; 4628 } 4629 4630 static bool block_check_mappings(uvm_va_block_t *block) 4631 { 4632 uvm_page_index_t page_index; 4633 uvm_processor_id_t id; 4634 4635 // Verify the master masks, since block_check_mappings_page relies on them 4636 for_each_processor_id(id) { 4637 const uvm_page_mask_t *resident_mask, *map_mask; 4638 4639 if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) { 4640 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 4641 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); 4642 UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id)); 4643 continue; 4644 } 4645 4646 resident_mask = uvm_va_block_resident_mask_get(block, id); 4647 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask)); 4648 4649 map_mask = uvm_va_block_map_mask_get(block, id); 4650 UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask)); 4651 4652 if (UVM_ID_IS_GPU(id)) { 4653 const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id); 4654 UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask)); 4655 4656 // Pages cannot be resident if they are marked as evicted 4657 UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask)); 4658 4659 // Pages cannot be resident on a GPU with no memory 4660 if (!block_processor_has_memory(block, id)) 4661 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 4662 } 4663 } 4664 4665 // Check that every page has coherent mappings 4666 for_each_va_block_page(page_index, block) 4667 block_check_mappings_page(block, page_index); 4668 4669 for_each_gpu_id(id) { 4670 if 
(uvm_va_block_gpu_state_get(block, id)) { 4671 uvm_gpu_t *gpu = block_get_gpu(block, id); 4672 4673 // Check big and/or 2M PTE state 4674 block_check_mappings_ptes(block, gpu); 4675 } 4676 } 4677 4678 return true; 4679 } 4680 4681 // See the comments on uvm_va_block_unmap 4682 static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages) 4683 { 4684 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 4685 uvm_pte_bits_cpu_t pte_bit; 4686 bool unmapped_something = false; 4687 uvm_va_block_region_t subregion; 4688 NvU32 num_mapped_processors; 4689 4690 // Early-out if nothing in the region is mapped or being unmapped. 4691 if (!block_has_valid_mapping_cpu(block, region) || 4692 (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))) 4693 return; 4694 4695 // We can't actually unmap HMM ranges from the CPU here. 4696 // Unmapping happens as part of migrate_vma_setup(). 4697 if (uvm_va_block_is_hmm(block)) { 4698 UVM_ASSERT(!uvm_va_block_is_hmm(block)); 4699 return; 4700 } 4701 4702 num_mapped_processors = uvm_processor_mask_get_count(&block->mapped); 4703 4704 // If we are unmapping a page which we are tracking due to CPU faults with 4705 // correct permissions, clear the info. This will cover both the unmap and 4706 // revoke cases (since we implement CPU revocation by unmap + map) 4707 if (block->cpu.fault_authorized.first_fault_stamp && 4708 uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index)) 4709 block->cpu.fault_authorized.first_fault_stamp = 0; 4710 4711 for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) { 4712 if (!block_has_valid_mapping_cpu(block, subregion)) 4713 continue; 4714 4715 unmap_mapping_range(va_space->mapping, 4716 uvm_va_block_region_start(block, subregion), 4717 uvm_va_block_region_size(subregion), 1); 4718 4719 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) 4720 uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion); 4721 4722 // If the CPU is the only processor with mappings we can safely mark 4723 // the pages as fully unmapped 4724 if (num_mapped_processors == 1) 4725 uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion); 4726 4727 unmapped_something = true; 4728 } 4729 4730 if (!unmapped_something) 4731 return; 4732 4733 // Check whether the block has any more mappings 4734 if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) { 4735 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 4736 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 4737 } 4738 4739 UVM_ASSERT(block_check_mappings(block)); 4740 } 4741 4742 // Given a mask of mapped pages, returns true if any of the pages in the mask 4743 // are mapped remotely by the given GPU. 4744 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block, 4745 uvm_va_block_context_t *block_context, 4746 uvm_gpu_id_t gpu_id, 4747 const uvm_page_mask_t *mapped_pages) 4748 { 4749 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); 4750 4751 if (!gpu_state) 4752 return false; 4753 4754 // The caller must ensure that all pages of the input mask are really mapped 4755 UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); 4756 4757 // UVM-Lite GPUs map the preferred location if it's accessible, regardless 4758 // of the resident location. 
4759 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) { 4760 if (uvm_page_mask_empty(mapped_pages)) 4761 return false; 4762 4763 return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id); 4764 } 4765 4766 // Remote pages are pages which are mapped but not resident locally 4767 return uvm_page_mask_andnot(&block_context->scratch_page_mask, mapped_pages, &gpu_state->resident); 4768 } 4769 4770 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If 4771 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. 4772 // 4773 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The 4774 // caller is responsible for ending the TLB batch with the appropriate membar. 4775 static void block_gpu_pte_clear_4k(uvm_va_block_t *block, 4776 uvm_gpu_t *gpu, 4777 const uvm_page_mask_t *clear_page_mask, 4778 NvU64 pte_clear_val, 4779 uvm_pte_batch_t *pte_batch, 4780 uvm_tlb_batch_t *tlb_batch) 4781 { 4782 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4783 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 4784 uvm_gpu_phys_address_t pte_addr; 4785 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); 4786 uvm_va_block_region_t region = uvm_va_block_region_from_block(block); 4787 uvm_va_block_region_t subregion; 4788 size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; 4789 4790 for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) { 4791 num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page; 4792 4793 pte_addr = uvm_page_table_range_entry_address(tree, 4794 &gpu_state->page_table_range_4k, 4795 subregion.first * ptes_per_page); 4796 4797 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes); 4798 4799 if (tlb_batch) { 4800 uvm_tlb_batch_invalidate(tlb_batch, 4801 uvm_va_block_region_start(block, subregion), 4802 uvm_va_block_region_size(subregion), 4803 UVM_PAGE_SIZE_4K, 4804 UVM_MEMBAR_NONE); 4805 } 4806 } 4807 } 4808 4809 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id 4810 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use 4811 // block_gpu_pte_clear_4k instead. 4812 // 4813 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. 4814 // 4815 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The 4816 // caller is responsible for ending the TLB batch with the appropriate membar. 
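//
// Note that one CPU PAGE_SIZE page may be backed by several 4k GPU PTEs. As an
// illustration only (the names and numbers below are hypothetical; the actual
// values depend on the kernel configuration), with a 64K CPU page size:
//
//     ptes_per_page   = PAGE_SIZE / UVM_PAGE_SIZE_4K;  // 65536 / 4096 == 16
//     first_pte_index = page_index * ptes_per_page;    // 16 GPU PTEs per page
//
// so each page set in write_page_mask produces ptes_per_page consecutive PTE
// writes, each advancing the physical address by UVM_PAGE_SIZE_4K.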
4817 static void block_gpu_pte_write_4k(uvm_va_block_t *block, 4818 uvm_gpu_t *gpu, 4819 uvm_processor_id_t resident_id, 4820 uvm_prot_t new_prot, 4821 const uvm_page_mask_t *write_page_mask, 4822 uvm_pte_batch_t *pte_batch, 4823 uvm_tlb_batch_t *tlb_batch) 4824 { 4825 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4826 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 4827 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); 4828 const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; 4829 uvm_va_block_region_t contig_region = {0}; 4830 uvm_gpu_phys_address_t contig_addr = {0}; 4831 uvm_gpu_phys_address_t page_addr = {0}; 4832 uvm_page_index_t page_index; 4833 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 4834 4835 UVM_ASSERT(new_prot != UVM_PROT_NONE); 4836 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 4837 4838 for_each_va_block_page_in_mask(page_index, write_page_mask, block) { 4839 uvm_gpu_phys_address_t pte_addr; 4840 size_t i; 4841 4842 // Assume that this mapping will be used to write to the page 4843 if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) 4844 block_mark_cpu_page_dirty(block, page_index); 4845 4846 if (page_index >= contig_region.outer) { 4847 contig_region = block_phys_contig_region(block, page_index, resident_id); 4848 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); 4849 page_addr = contig_addr; 4850 } 4851 4852 page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE; 4853 4854 pte_addr = uvm_page_table_range_entry_address(tree, 4855 &gpu_state->page_table_range_4k, 4856 page_index * ptes_per_page); 4857 4858 // Handle PAGE_SIZE > GPU PTE size 4859 for (i = 0; i < ptes_per_page; i++) { 4860 NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 4861 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 4862 page_addr.address += UVM_PAGE_SIZE_4K; 4863 pte_addr.address += pte_size; 4864 } 4865 4866 if (tlb_batch) { 4867 NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index); 4868 uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE); 4869 } 4870 } 4871 } 4872 4873 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered. 4874 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It 4875 // only writes 4k PTEs, not big PTEs. 4876 // 4877 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the 4878 // mapping from the corresponding big page (0) and which ones should be written 4879 // using memory from resident_id and new_prot (1). Unlike the other pte_write 4880 // functions, new_prot may be UVM_PROT_NONE. 4881 // 4882 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID 4883 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE 4884 // in this case. 4885 // 4886 // new_pages_mask must not be NULL. 4887 // 4888 // No TLB invalidates are required since we've set up the lower PTEs to never be 4889 // cached by the GPU's MMU when covered by larger PTEs. 
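//
// Hypothetical example: if a big PTE being split covers pages [16..31] and
// new_pages_mask has only pages 16..19 set, then pages 16..19 are written with
// (resident_id, new_prot) while pages 20..31 are written to match the big
// PTE's current residency and permissions (or cleared, if that big PTE is
// currently invalid).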
4890 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block, 4891 uvm_va_block_context_t *block_context, 4892 uvm_gpu_t *gpu, 4893 uvm_processor_id_t resident_id, 4894 uvm_prot_t new_prot, 4895 const unsigned long *big_ptes_covered, 4896 const uvm_page_mask_t *new_pages_mask, 4897 uvm_pte_batch_t *pte_batch) 4898 { 4899 uvm_va_block_region_t big_region; 4900 size_t big_page_index; 4901 uvm_processor_id_t curr_resident_id; 4902 uvm_prot_t curr_prot; 4903 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 4904 4905 if (UVM_ID_IS_INVALID(resident_id)) 4906 UVM_ASSERT(new_prot == UVM_PROT_NONE); 4907 4908 for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 4909 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 4910 4911 curr_prot = block_page_prot_gpu(block, gpu, big_region.first); 4912 4913 // The unmap path doesn't know the current residency ahead of time, so 4914 // we have to look it up. 4915 if (UVM_ID_IS_INVALID(resident_id)) { 4916 curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 4917 } 4918 else { 4919 // Check that we aren't changing the aperture of the existing 4920 // mappings. It could be legal in some cases (switching from {RO, A} 4921 // to {RO, B} for example) but we'd need to issue TLB membars. 4922 if (curr_prot != UVM_PROT_NONE) 4923 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id)); 4924 4925 curr_resident_id = resident_id; 4926 } 4927 4928 // pages in new_pages_mask under this big page get new_prot 4929 uvm_page_mask_zero(&block_context->scratch_page_mask); 4930 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 4931 if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { 4932 if (new_prot == UVM_PROT_NONE) { 4933 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 4934 } 4935 else { 4936 block_gpu_pte_write_4k(block, 4937 gpu, 4938 curr_resident_id, 4939 new_prot, 4940 &block_context->scratch_page_mask, 4941 pte_batch, 4942 NULL); 4943 } 4944 } 4945 4946 // All other pages under this big page inherit curr_prot 4947 uvm_page_mask_zero(&block_context->scratch_page_mask); 4948 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 4949 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { 4950 if (curr_prot == UVM_PROT_NONE) { 4951 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 4952 } 4953 else { 4954 block_gpu_pte_write_4k(block, 4955 gpu, 4956 curr_resident_id, 4957 curr_prot, 4958 &block_context->scratch_page_mask, 4959 pte_batch, 4960 NULL); 4961 } 4962 } 4963 } 4964 } 4965 4966 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is 4967 // NULL, all big PTEs in the {block, gpu} are cleared. 4968 // 4969 // If tlb_batch is provided, the big PTEs written are added to the batch. The 4970 // caller is responsible for ending the TLB batch with the appropriate membar. 
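//
// Callers below typically pass one of these clear values:
//
//     0                                       - make the big PTEs invalid so
//                                               the GPU MMU reads the 4k PTEs
//                                               underneath them
//     tree->hal->unmapped_pte(big_page_size)  - mark the range unmapped so MMU
//                                               fills stop at the big PTE
//     tree->hal->poisoned_pte()               - debug-only guard value for PTEs
//                                               which should never be accessed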
4971 static void block_gpu_pte_clear_big(uvm_va_block_t *block, 4972 uvm_gpu_t *gpu, 4973 const unsigned long *big_ptes_mask, 4974 NvU64 pte_clear_val, 4975 uvm_pte_batch_t *pte_batch, 4976 uvm_tlb_batch_t *tlb_batch) 4977 { 4978 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 4979 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 4980 NvU32 big_page_size = gpu_va_space->page_tables.big_page_size; 4981 uvm_gpu_phys_address_t pte_addr; 4982 NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size); 4983 size_t big_page_index; 4984 DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4985 4986 if (big_ptes_mask) 4987 bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 4988 else 4989 bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size)); 4990 4991 for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 4992 pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables, 4993 &gpu_state->page_table_range_big, 4994 big_page_index); 4995 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1); 4996 4997 if (tlb_batch) { 4998 uvm_tlb_batch_invalidate(tlb_batch, 4999 uvm_va_block_big_page_addr(block, big_page_index, big_page_size), 5000 big_page_size, 5001 big_page_size, 5002 UVM_MEMBAR_NONE); 5003 } 5004 } 5005 } 5006 5007 // Writes the big PTEs in big_ptes_mask using memory from resident_id with 5008 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use 5009 // block_gpu_pte_clear_big instead. 5010 // 5011 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL. 5012 // 5013 // If tlb_batch is provided, the big PTEs written are added to the batch. The 5014 // caller is responsible for ending the TLB batch with the appropriate membar. 
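//
// Each big PTE maps big_page_size bytes with a single physical address, so the
// memory backing every big page written here is expected to be physically
// contiguous across the whole big page (for example a single GPU chunk or a
// large enough CPU chunk); see the contiguity checks in
// block_check_mappings_ptes.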
5015 static void block_gpu_pte_write_big(uvm_va_block_t *block, 5016 uvm_gpu_t *gpu, 5017 uvm_processor_id_t resident_id, 5018 uvm_prot_t new_prot, 5019 const unsigned long *big_ptes_mask, 5020 uvm_pte_batch_t *pte_batch, 5021 uvm_tlb_batch_t *tlb_batch) 5022 { 5023 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5024 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5025 uvm_page_tree_t *tree = &gpu_va_space->page_tables; 5026 NvU32 big_page_size = tree->big_page_size; 5027 NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size); 5028 size_t big_page_index; 5029 uvm_va_block_region_t contig_region = {0}; 5030 uvm_gpu_phys_address_t contig_addr = {0}; 5031 uvm_gpu_phys_address_t page_addr = {0}; 5032 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 5033 5034 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5035 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5036 UVM_ASSERT(big_ptes_mask); 5037 5038 if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 5039 UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0); 5040 5041 if (!gpu->parent->can_map_sysmem_with_large_pages) 5042 UVM_ASSERT(UVM_ID_IS_GPU(resident_id)); 5043 } 5044 5045 for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5046 NvU64 pte_val; 5047 uvm_gpu_phys_address_t pte_addr; 5048 uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5049 5050 // Assume that this mapping will be used to write to the page 5051 if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) { 5052 uvm_page_index_t page_index; 5053 5054 for_each_va_block_page_in_region(page_index, big_region) 5055 block_mark_cpu_page_dirty(block, page_index); 5056 } 5057 5058 if (big_region.first >= contig_region.outer) { 5059 contig_region = block_phys_contig_region(block, big_region.first, resident_id); 5060 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); 5061 page_addr = contig_addr; 5062 } 5063 5064 page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE; 5065 5066 pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index); 5067 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 5068 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 5069 5070 if (tlb_batch) { 5071 uvm_tlb_batch_invalidate(tlb_batch, 5072 uvm_va_block_region_start(block, big_region), 5073 big_page_size, 5074 big_page_size, 5075 UVM_MEMBAR_NONE); 5076 } 5077 } 5078 } 5079 5080 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in 5081 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and 5082 // tlb_batch in order to poison the now-unused 4k PTEs. 5083 // 5084 // The 4k PTEs are invalidated with the specified membar. 
5085 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block, 5086 uvm_va_block_context_t *block_context, 5087 uvm_gpu_t *gpu, 5088 const unsigned long *big_ptes_to_merge, 5089 uvm_push_t *push, 5090 uvm_pte_batch_t *pte_batch, 5091 uvm_tlb_batch_t *tlb_batch, 5092 uvm_membar_t tlb_membar) 5093 { 5094 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5095 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5096 NvU32 big_page_size = tree->big_page_size; 5097 NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size); 5098 size_t big_page_index; 5099 DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5100 5101 UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5102 UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5103 5104 // We can be called with the 4k PTEs in two cases: 5105 // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active. 5106 // 5107 // 2) 4k PTEs unallocated. In this case the GPU may not have invalid 4k PTEs 5108 // active under the big PTE, depending on whether neighboring blocks 5109 // caused the page tables to be allocated. 5110 // 5111 // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has 5112 // them cached. 5113 5114 // Each big PTE is currently invalid so the 4ks are active (or unallocated). 5115 // First make the big PTEs unmapped to disable future lookups of the 4ks 5116 // under it. We can't directly transition the entry from valid 4k PTEs to 5117 // valid big PTEs, because that could cause the GPU TLBs to cache the same 5118 // VA in different cache lines. That could cause memory ordering to not be 5119 // maintained. 5120 block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch); 5121 5122 // Now invalidate the big PTEs we just wrote as well as all 4ks under them. 5123 // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only 5124 // need to invalidate the 4k PTEs without actually writing them. 5125 for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5126 uvm_tlb_batch_invalidate(tlb_batch, 5127 uvm_va_block_big_page_addr(block, big_page_index, big_page_size), 5128 big_page_size, 5129 big_page_size | UVM_PAGE_SIZE_4K, 5130 UVM_MEMBAR_NONE); 5131 } 5132 5133 // End the batches for the caller. We need to do this here in order to 5134 // poison the 4ks below. 5135 uvm_pte_batch_end(pte_batch); 5136 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5137 5138 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused 5139 // PTEs with a pattern which will trigger fatal faults on access. We have to 5140 // do this after the TLB invalidate of the big PTEs, or the GPU might use 5141 // the new values. 5142 if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) { 5143 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge); 5144 uvm_pte_batch_begin(push, pte_batch); 5145 block_gpu_pte_clear_4k(block, 5146 gpu, 5147 &block_context->scratch_page_mask, 5148 tree->hal->poisoned_pte(), 5149 pte_batch, 5150 NULL); 5151 uvm_pte_batch_end(pte_batch); 5152 } 5153 } 5154 5155 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}. 5156 // 5157 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is 5158 // responsible for ending the TLB batch with the appropriate membar. 
5159 static void block_gpu_pte_clear_2m(uvm_va_block_t *block, 5160 uvm_gpu_t *gpu, 5161 uvm_pte_batch_t *pte_batch, 5162 uvm_tlb_batch_t *tlb_batch) 5163 { 5164 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5165 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5166 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); 5167 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); 5168 5169 // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE, 5170 // which would cause a problem when trying to make the entry invalid since 5171 // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire 5172 // 16 bytes. 5173 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1); 5174 5175 if (tlb_batch) 5176 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5177 } 5178 5179 // Writes the 2M PTE for {block, gpu} using memory from resident_id with 5180 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use 5181 // block_gpu_pte_clear_2m instead. 5182 // 5183 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is 5184 // responsible for ending the TLB batch with the appropriate membar. 5185 static void block_gpu_pte_write_2m(uvm_va_block_t *block, 5186 uvm_gpu_t *gpu, 5187 uvm_processor_id_t resident_id, 5188 uvm_prot_t new_prot, 5189 uvm_pte_batch_t *pte_batch, 5190 uvm_tlb_batch_t *tlb_batch) 5191 { 5192 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5193 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5194 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); 5195 uvm_gpu_phys_address_t page_addr; 5196 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); 5197 NvU64 pte_val; 5198 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); 5199 5200 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5201 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5202 5203 if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) 5204 block_mark_cpu_page_dirty(block, 0); 5205 5206 page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu); 5207 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); 5208 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); 5209 5210 if (tlb_batch) 5211 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5212 } 5213 5214 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu) 5215 { 5216 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5217 5218 if (!block_gpu_supports_2m(block, gpu)) 5219 return false; 5220 5221 if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) || 5222 (gpu_state->page_table_range_4k.table && !gpu_state->activated_4k)) 5223 return true; 5224 5225 return false; 5226 } 5227 5228 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or 5229 // activates a newly-allocated page table (big or 4k) while the other is already 5230 // active. The caller must have already written the new PTEs under the table 5231 // with the appropriate membar. 
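//
// For example (illustrative): if a block currently mapped only with big PTEs
// needs its first 4k PTEs, the newly-allocated 4k page table starts out
// inactive. Once the caller has written the new 4k PTEs, this function
// rewrites the 2M PDE so it references both lower tables and invalidates the
// PDE, so that once the caller's TLB batch completes the new 4k PTEs become
// visible to the GPU MMU.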
5232 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch) 5233 { 5234 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5235 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5236 5237 if (!gpu_state->pte_is_2m) 5238 UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu)); 5239 5240 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); 5241 5242 // We always need a membar to order PDE/PTE writes with the TLB invalidate. 5243 // write_pde will do a MEMBAR_SYS by default. 5244 if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID) 5245 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); 5246 uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push); 5247 5248 gpu->parent->host_hal->wait_for_idle(push); 5249 5250 // Invalidate just the PDE 5251 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); 5252 5253 if (gpu_state->page_table_range_big.table) 5254 gpu_state->activated_big = true; 5255 5256 if (gpu_state->page_table_range_4k.table) 5257 gpu_state->activated_4k = true; 5258 } 5259 5260 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should 5261 // have written all lower PTEs as appropriate into the given pte_batch already. 5262 // This function ends the PTE batch, activates the 2M PDE, and does a TLB 5263 // invalidate. 5264 // 5265 // The caller does not need to do any TLB invalidates since none of the lower 5266 // PTEs could be cached. 5267 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block, 5268 uvm_gpu_t *gpu, 5269 uvm_push_t *push, 5270 uvm_pte_batch_t *pte_batch, 5271 uvm_tlb_batch_t *tlb_batch, 5272 uvm_membar_t tlb_membar) 5273 { 5274 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5275 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5276 5277 // Step 1: Make the 2M entry invalid. We can't directly transition from a 5278 // valid 2M PTE to valid lower PTEs, because that could cause the 5279 // GPU TLBs to cache the same VA in different cache lines. That 5280 // could cause memory ordering to not be maintained. 5281 // 5282 // If the 2M PTE is already invalid, no TLB invalidate is needed. 5283 5284 if (curr_prot == UVM_PROT_NONE) { 5285 // If we aren't downgrading, then we don't need a membar. 5286 UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE); 5287 5288 // End the batch, which pushes a membar to ensure that the caller's PTE 5289 // writes below 2M are observed before the PDE write we're about to do. 5290 uvm_pte_batch_end(pte_batch); 5291 } 5292 else { 5293 // The 64k and 4k PTEs can't possibly be cached since the 2M entry is 5294 // not yet a PDE, so we just need to invalidate this single 2M entry. 5295 uvm_tlb_batch_begin(tree, tlb_batch); 5296 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); 5297 5298 // Make sure the PTE writes are observed before the TLB invalidate 5299 uvm_pte_batch_end(pte_batch); 5300 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5301 } 5302 5303 // Step 2: Switch the 2M entry from invalid to a PDE. This activates the 5304 // smaller PTEs. 5305 uvm_tlb_batch_begin(tree, tlb_batch); 5306 block_gpu_write_pde(block, gpu, push, tlb_batch); 5307 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5308 } 5309 5310 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE. 
5311 // Any lower PTEs are invalidated with the specified membar. 5312 static void block_gpu_pte_merge_2m(uvm_va_block_t *block, 5313 uvm_va_block_context_t *block_context, 5314 uvm_gpu_t *gpu, 5315 uvm_push_t *push, 5316 uvm_membar_t tlb_membar) 5317 { 5318 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5319 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5320 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5321 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5322 NvU32 tlb_inval_sizes; 5323 5324 UVM_ASSERT(!gpu_state->pte_is_2m); 5325 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); 5326 5327 // The 2M entry is currently a PDE, so first make it invalid. We can't 5328 // directly transition the entry from a valid PDE to a valid 2M PTE, because 5329 // that could cause the GPU TLBs to cache the same VA in different cache 5330 // lines. That could cause memory ordering to not be maintained. 5331 uvm_pte_batch_begin(push, pte_batch); 5332 block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL); 5333 uvm_pte_batch_end(pte_batch); 5334 5335 // Now invalidate both the 2M entry we just wrote as well as all lower-level 5336 // entries which could be cached. Subsequent MMU fills will stop at the now- 5337 // invalid 2M entry, so we only need to invalidate the lower PTEs without 5338 // actually writing them. 5339 tlb_inval_sizes = UVM_PAGE_SIZE_2M; 5340 if (gpu_state->page_table_range_big.table) 5341 tlb_inval_sizes |= UVM_PAGE_SIZE_64K; 5342 5343 // Strictly-speaking we only need to invalidate those 4k ranges which are 5344 // not covered by a big pte. However, any such invalidate will require 5345 // enough 4k invalidates to force the TLB batching to invalidate everything 5346 // anyway, so just do the simpler thing. 5347 if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5348 tlb_inval_sizes |= UVM_PAGE_SIZE_4K; 5349 5350 uvm_tlb_batch_begin(tree, tlb_batch); 5351 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE); 5352 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5353 5354 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused 5355 // PTEs with a pattern which will trigger fatal faults on access. We have to 5356 // do this after the TLB invalidate of the 2M entry, or the GPU might use 5357 // the new values. 5358 if (UVM_IS_DEBUG()) { 5359 uvm_pte_batch_begin(push, pte_batch); 5360 5361 if (gpu_state->page_table_range_big.table) { 5362 block_gpu_pte_clear_big(block, 5363 gpu, 5364 NULL, 5365 tree->hal->poisoned_pte(), 5366 pte_batch, 5367 NULL); 5368 } 5369 5370 if (gpu_state->page_table_range_4k.table) { 5371 block_gpu_pte_clear_4k(block, 5372 gpu, 5373 NULL, 5374 tree->hal->poisoned_pte(), 5375 pte_batch, 5376 NULL); 5377 } 5378 5379 uvm_pte_batch_end(pte_batch); 5380 } 5381 } 5382 5383 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) 5384 { 5385 // Permissions upgrades (MAP) don't need membars 5386 if (pte_op == BLOCK_PTE_OP_MAP) 5387 return UVM_MEMBAR_NONE; 5388 5389 UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); 5390 UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE); 5391 5392 return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id)); 5393 } 5394 5395 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot 5396 // permissions. 
If the 2M entry is currently a PDE, it is first merged into a 5397 // PTE. 5398 // 5399 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead. 5400 // 5401 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5402 // the TLB membar required. 5403 static void block_gpu_map_to_2m(uvm_va_block_t *block, 5404 uvm_va_block_context_t *block_context, 5405 uvm_gpu_t *gpu, 5406 uvm_processor_id_t resident_id, 5407 uvm_prot_t new_prot, 5408 uvm_push_t *push, 5409 block_pte_op_t pte_op) 5410 { 5411 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5412 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5413 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5414 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5415 uvm_membar_t tlb_membar; 5416 5417 UVM_ASSERT(new_prot != UVM_PROT_NONE); 5418 5419 // If we have a mix of big and 4k PTEs, we have to first merge them to an 5420 // invalid 2M PTE. 5421 if (!gpu_state->pte_is_2m) { 5422 block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE); 5423 5424 gpu_state->pte_is_2m = true; 5425 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5426 } 5427 5428 // Write the new permissions 5429 uvm_pte_batch_begin(push, pte_batch); 5430 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 5431 5432 block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch); 5433 5434 uvm_pte_batch_end(pte_batch); 5435 5436 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5437 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5438 } 5439 5440 // Combination split + map operation, called when only part of a 2M PTE mapping 5441 // is being changed. This splits an existing valid or invalid 2M PTE into the 5442 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state. 5443 // 5444 // The PTEs covering the pages in pages_to_write are written to the memory on 5445 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. 5446 // 5447 // The PTEs covering the pages not set in pages_to_write inherit the mapping of 5448 // the current 2M PTE. If the current mapping is valid, it must target 5449 // resident_id. 5450 // 5451 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5452 // the TLB membar required. 
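//
// Hypothetical example: upgrading only the first 64K of a 2M PTE which is
// currently mapped read-only. new_pte_state would keep every region as a big
// PTE, with only the first big PTE covered by the operation: that big PTE is
// written with new_prot, the others inherit the current read-only 2M mapping,
// and no 4k PTEs are needed. If instead the operation covered only part of a
// big page, that big PTE would be split and its 4k PTEs written individually.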
5453 static void block_gpu_map_split_2m(uvm_va_block_t *block, 5454 uvm_va_block_context_t *block_context, 5455 uvm_gpu_t *gpu, 5456 uvm_processor_id_t resident_id, 5457 const uvm_page_mask_t *pages_to_write, 5458 uvm_prot_t new_prot, 5459 uvm_push_t *push, 5460 block_pte_op_t pte_op) 5461 { 5462 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5463 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5464 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5465 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5466 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5467 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5468 uvm_membar_t tlb_membar; 5469 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5470 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5471 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5472 5473 UVM_ASSERT(gpu_state->pte_is_2m); 5474 5475 if (!gpu_state->page_table_range_4k.table) 5476 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5477 5478 uvm_pte_batch_begin(push, pte_batch); 5479 5480 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5481 // from the lower levels. This means we don't need to issue a TLB invalidate 5482 // when writing those levels. 5483 5484 // Cases to handle: 5485 // 1) Big PTEs which inherit curr_prot 5486 // 2) Big PTEs which get new_prot 5487 // 3) Big PTEs which are split to 4k 5488 // a) 4k PTEs which inherit curr_prot under the split big PTEs 5489 // b) 4k PTEs which get new_prot under the split big PTEs 5490 5491 // Compute the big PTEs which will need to be split to 4k, if any. 5492 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5493 5494 if (gpu_state->page_table_range_big.table) { 5495 // Case 1: Write the big PTEs which will inherit the 2M permissions, if 5496 // any. These are the big PTEs which are unchanged (uncovered) by the 5497 // operation. 5498 bitmap_andnot(big_ptes_inherit, 5499 new_pte_state->big_ptes, 5500 new_pte_state->big_ptes_covered, 5501 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5502 5503 if (curr_prot == UVM_PROT_NONE) { 5504 block_gpu_pte_clear_big(block, 5505 gpu, 5506 big_ptes_inherit, 5507 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), 5508 pte_batch, 5509 NULL); 5510 } 5511 else { 5512 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); 5513 } 5514 5515 // Case 2: Write the new big PTEs 5516 bitmap_and(big_ptes_new_prot, 5517 new_pte_state->big_ptes, 5518 new_pte_state->big_ptes_covered, 5519 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5520 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL); 5521 5522 // Case 3: Write the big PTEs which cover 4k PTEs 5523 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5524 5525 // We just wrote all possible big PTEs, so mark them as initialized 5526 gpu_state->initialized_big = true; 5527 } 5528 else { 5529 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5530 } 5531 5532 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs 5533 block_gpu_pte_big_split_write_4k(block, 5534 block_context, 5535 gpu, 5536 resident_id, 5537 new_prot, 5538 big_ptes_split, 5539 pages_to_write, 5540 pte_batch); 5541 5542 // Activate the 2M PDE. 
This ends the pte_batch and issues a single TLB 5543 // invalidate for the 2M entry. 5544 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5545 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar); 5546 5547 gpu_state->pte_is_2m = false; 5548 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5549 } 5550 5551 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed. 5552 // 5553 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should 5554 // be 4k. 5555 static void block_gpu_split_2m(uvm_va_block_t *block, 5556 uvm_va_block_context_t *block_context, 5557 uvm_gpu_t *gpu, 5558 const unsigned long *new_big_ptes, 5559 uvm_push_t *push) 5560 { 5561 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5562 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5563 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5564 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5565 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5566 DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5567 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5568 NvU64 unmapped_pte_val; 5569 uvm_processor_id_t curr_residency; 5570 5571 UVM_ASSERT(gpu_state->pte_is_2m); 5572 5573 if (new_big_ptes) 5574 bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5575 else 5576 bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5577 5578 if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5579 UVM_ASSERT(gpu_state->page_table_range_big.table); 5580 5581 // We're splitting from 2M to big only, so we'll be writing all big PTEs 5582 if (gpu_state->page_table_range_big.table) 5583 gpu_state->initialized_big = true; 5584 5585 // Cases to handle: 5586 // 1) Big PTEs which inherit curr_prot 5587 // 2) Big PTEs which are split to 4k 5588 // a) 4k PTEs inherit curr_prot under the split big PTEs 5589 5590 // big_ptes_split will cover the 4k regions 5591 bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5592 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split); 5593 5594 uvm_pte_batch_begin(push, pte_batch); 5595 5596 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5597 // from the lower levels. This means we don't need to issue a TLB invalidate 5598 // when writing those levels. 
5599 5600 if (curr_prot == UVM_PROT_NONE) { 5601 unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size); 5602 5603 // Case 2a: Clear the 4k PTEs under big_ptes_split 5604 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 5605 5606 // Case 1: Make the remaining big PTEs unmapped 5607 block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL); 5608 } 5609 else { 5610 curr_residency = block_gpu_get_processor_to_map(block, gpu, 0); 5611 5612 // Case 2a: Write the new 4k PTEs under big_ptes_split 5613 block_gpu_pte_write_4k(block, 5614 gpu, 5615 curr_residency, 5616 curr_prot, 5617 &block_context->scratch_page_mask, 5618 pte_batch, 5619 NULL); 5620 5621 // Case 1: Write the new big PTEs 5622 block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL); 5623 } 5624 5625 // Case 2: Make big_ptes_split invalid to activate the 4k PTEs 5626 if (gpu_state->page_table_range_big.table) 5627 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 5628 5629 // Activate the 2M PDE. This ends the pte_batch and issues a single TLB 5630 // invalidate for the 2M entry. No membar is necessary since we aren't 5631 // changing permissions. 5632 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE); 5633 5634 gpu_state->pte_is_2m = false; 5635 bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5636 } 5637 5638 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are 5639 // changed. 5640 // 5641 // big_ptes_to_split must not be NULL. 5642 static void block_gpu_split_big(uvm_va_block_t *block, 5643 uvm_va_block_context_t *block_context, 5644 uvm_gpu_t *gpu, 5645 const unsigned long *big_ptes_to_split, 5646 uvm_push_t *push) 5647 { 5648 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5649 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5650 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5651 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5652 NvU32 big_page_size = tree->big_page_size; 5653 uvm_va_block_region_t big_region; 5654 uvm_processor_id_t resident_id; 5655 size_t big_page_index; 5656 uvm_prot_t curr_prot; 5657 DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5658 5659 UVM_ASSERT(!gpu_state->pte_is_2m); 5660 UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5661 UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5662 5663 uvm_pte_batch_begin(push, pte_batch); 5664 uvm_tlb_batch_begin(tree, tlb_batch); 5665 5666 // Write all 4k PTEs under all big PTEs which are being split. We'll make 5667 // the big PTEs inactive below after flushing these writes. No TLB 5668 // invalidate is needed since the big PTE is active. 
5669 bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5670 for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5671 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5672 curr_prot = block_page_prot_gpu(block, gpu, big_region.first); 5673 5674 uvm_page_mask_zero(&block_context->scratch_page_mask); 5675 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); 5676 if (curr_prot == UVM_PROT_NONE) { 5677 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 5678 } 5679 else { 5680 __set_bit(big_page_index, big_ptes_valid); 5681 5682 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); 5683 5684 block_gpu_pte_write_4k(block, 5685 gpu, 5686 resident_id, 5687 curr_prot, 5688 &block_context->scratch_page_mask, 5689 pte_batch, 5690 NULL); 5691 } 5692 } 5693 5694 // Unmap the big PTEs which are valid and are being split to 4k. We can't 5695 // directly transition from a valid big PTE to valid lower PTEs, because 5696 // that could cause the GPU TLBs to cache the same VA in different cache 5697 // lines. That could cause memory ordering to not be maintained. 5698 block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); 5699 5700 // End the batches. We have to commit the membars and TLB invalidates 5701 // before we finish splitting formerly-big PTEs. No membar is necessary 5702 // since we aren't changing permissions. 5703 uvm_pte_batch_end(pte_batch); 5704 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5705 5706 // Finish the split by switching the big PTEs from unmapped to invalid. This 5707 // causes the GPU MMU to start reading the 4k PTEs instead of stopping at 5708 // the unmapped big PTEs. 5709 uvm_pte_batch_begin(push, pte_batch); 5710 uvm_tlb_batch_begin(tree, tlb_batch); 5711 5712 block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch); 5713 5714 uvm_pte_batch_end(pte_batch); 5715 5716 // Finally, activate the page tables if they're inactive 5717 if (block_gpu_needs_to_activate_table(block, gpu)) 5718 block_gpu_write_pde(block, gpu, push, tlb_batch); 5719 5720 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5721 5722 bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5723 } 5724 5725 // Changes permissions on some pre-existing mix of big and 4k PTEs into some 5726 // other mix of big and 4k PTEs, as described by 5727 // block_context->mapping.new_pte_state. 5728 // 5729 // The PTEs covering the pages in pages_to_write are written to the memory on 5730 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. 5731 // 5732 // pte_op specifies whether this is a MAP or REVOKE operation, which determines 5733 // the TLB membar required. 
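//
// The cases below are selected with bitmap algebra over the current and new
// big-PTE masks. A hypothetical example with three big pages, where
// gpu_state->big_ptes is 0b011 and new_pte_state->big_ptes is 0b110:
//
//     split = old & ~new = 0b001   (big PTE 0 is split into 4k PTEs)
//     merge = new & ~old = 0b100   (big PTE 2 is merged from 4k PTEs)
//     keep  = old &  new = 0b010   (big PTE 1 stays big)
//
// Pages outside any big PTE both before and after the operation are simply
// written as 4k PTEs.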
5734 static void block_gpu_map_big_and_4k(uvm_va_block_t *block, 5735 uvm_va_block_context_t *block_context, 5736 uvm_gpu_t *gpu, 5737 uvm_processor_id_t resident_id, 5738 const uvm_page_mask_t *pages_to_write, 5739 uvm_prot_t new_prot, 5740 uvm_push_t *push, 5741 block_pte_op_t pte_op) 5742 { 5743 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5744 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5745 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5746 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5747 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5748 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5749 DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5750 DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5751 DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5752 uvm_va_block_region_t big_region; 5753 size_t big_page_index; 5754 NvU32 big_page_size = tree->big_page_size; 5755 uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); 5756 5757 UVM_ASSERT(!gpu_state->pte_is_2m); 5758 5759 uvm_pte_batch_begin(push, pte_batch); 5760 uvm_tlb_batch_begin(tree, tlb_batch); 5761 5762 // All of these cases might be performed in the same call: 5763 // 1) Split currently-big PTEs to 4k 5764 // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs 5765 // b) Write new 4k PTEs which get new_prot under the split big PTEs 5766 // 2) Merge currently-4k PTEs to big with new_prot 5767 // 3) Write currently-big PTEs which wholly get new_prot 5768 // 4) Write currently-4k PTEs which get new_prot 5769 // 5) Initialize big PTEs which are not covered by this operation 5770 5771 // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are 5772 // being split. We'll make the big PTEs inactive below after flushing these 5773 // writes. No TLB invalidate is needed since the big PTE is active. 5774 // 5775 // Mask computation: big_before && !big_after 5776 bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5777 5778 block_gpu_pte_big_split_write_4k(block, 5779 block_context, 5780 gpu, 5781 resident_id, 5782 new_prot, 5783 big_ptes_split, 5784 pages_to_write, 5785 pte_batch); 5786 5787 // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and 5788 // remain uncovered after the operation. 5789 // 5790 // Mask computation: !big_before && !big_after 5791 bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5792 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after); 5793 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) { 5794 block_gpu_pte_write_4k(block, 5795 gpu, 5796 resident_id, 5797 new_prot, 5798 &block_context->scratch_page_mask, 5799 pte_batch, 5800 tlb_batch); 5801 } 5802 5803 // Case 5: If the big page table is newly-allocated, make sure that all big 5804 // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are 5805 // all initialized to invalid. 5806 // 5807 // The similar case of making newly-allocated big PTEs unmapped when no 5808 // lower 4k table is present is handled by having 5809 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes 5810 // appropriately.
5811 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { 5812 // TODO: Bug 1766424: If we have the 4k page table already, we could 5813 // attempt to merge all uncovered big PTE regions when first 5814 // allocating the big table. That's probably not worth doing. 5815 UVM_ASSERT(gpu_state->page_table_range_4k.table); 5816 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 5817 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); 5818 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); 5819 gpu_state->initialized_big = true; 5820 } 5821 5822 // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are 5823 // being split to 4k. We can't directly transition from a valid big PTE to 5824 // valid lower PTEs, because that could cause the GPU TLBs to cache the same 5825 // VA in different cache lines. That could cause memory ordering to not be 5826 // maintained. 5827 bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5828 for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { 5829 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 5830 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first)) 5831 __set_bit(big_page_index, big_ptes_mask); 5832 } 5833 5834 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); 5835 5836 // Case 3: Write the currently-big PTEs which remain big PTEs, and are 5837 // wholly changing permissions. 5838 // 5839 // Mask computation: big_before && big_after && covered 5840 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5841 if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 5842 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch); 5843 5844 // Case 2 (step 1): Merge the new big PTEs and end the batches, now that 5845 // we've done all of the independent PTE writes we can. This also merges 5846 // newly-allocated uncovered big PTEs to unmapped (see 5847 // block_gpu_compute_new_pte_state). 5848 // 5849 // Mask computation: !big_before && big_after 5850 if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 5851 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB 5852 // batches. 5853 block_gpu_pte_merge_big_and_end(block, 5854 block_context, 5855 gpu, 5856 big_ptes_merge, 5857 push, 5858 pte_batch, 5859 tlb_batch, 5860 tlb_membar); 5861 5862 // Remove uncovered big PTEs. We needed to merge them to unmapped above, 5863 // but they shouldn't get new_prot below. 5864 bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5865 } 5866 else { 5867 // End the batches. We have to commit the membars and TLB invalidates 5868 // before we finish splitting formerly-big PTEs. 
5869 uvm_pte_batch_end(pte_batch); 5870 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5871 } 5872 5873 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 5874 !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 5875 block_gpu_needs_to_activate_table(block, gpu)) { 5876 5877 uvm_pte_batch_begin(push, pte_batch); 5878 uvm_tlb_batch_begin(tree, tlb_batch); 5879 5880 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by 5881 // switching them from unmapped to invalid. This causes the GPU MMU to 5882 // start reading the 4k PTEs instead of stopping at the unmapped big 5883 // PTEs. 5884 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); 5885 5886 // Case 2 (step 2): Finish merging our big PTEs, if we have any, by 5887 // switching them from unmapped to new_prot. 5888 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch); 5889 5890 uvm_pte_batch_end(pte_batch); 5891 5892 // Finally, activate the page tables if they're inactive 5893 if (block_gpu_needs_to_activate_table(block, gpu)) 5894 block_gpu_write_pde(block, gpu, push, tlb_batch); 5895 5896 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 5897 } 5898 5899 // Update gpu_state 5900 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5901 } 5902 5903 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is 5904 // merged into a PTE. 5905 static void block_gpu_unmap_to_2m(uvm_va_block_t *block, 5906 uvm_va_block_context_t *block_context, 5907 uvm_gpu_t *gpu, 5908 uvm_push_t *push, 5909 uvm_membar_t tlb_membar) 5910 { 5911 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5912 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); 5913 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5914 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5915 5916 if (gpu_state->pte_is_2m) { 5917 // If we're already mapped as a valid 2M PTE, just write it to invalid 5918 uvm_pte_batch_begin(push, pte_batch); 5919 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 5920 5921 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); 5922 5923 uvm_pte_batch_end(pte_batch); 5924 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 5925 } 5926 else { 5927 // Otherwise we have a mix of big and 4K PTEs which need to be merged 5928 // into an invalid 2M PTE. 5929 block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar); 5930 5931 gpu_state->pte_is_2m = true; 5932 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5933 } 5934 } 5935 5936 // Combination split + unmap operation, called when only part of a valid 2M PTE 5937 // mapping is being unmapped. The 2M PTE is split into a mix of valid and 5938 // invalid big and/or 4k PTEs, as described by 5939 // block_context->mapping.new_pte_state. 5940 // 5941 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped). 5942 // 5943 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of 5944 // the current 2M PTE. 
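// For example, block_unmap_gpu takes this path when gpu_state->pte_is_2m is
// set but the computed new_pte_state->pte_is_2m is not, i.e. when only some
// pages of a currently-valid 2M mapping are being unmapped.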
5945 static void block_gpu_unmap_split_2m(uvm_va_block_t *block, 5946 uvm_va_block_context_t *block_context, 5947 uvm_gpu_t *gpu, 5948 const uvm_page_mask_t *pages_to_unmap, 5949 uvm_push_t *push, 5950 uvm_membar_t tlb_membar) 5951 { 5952 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 5953 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 5954 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 5955 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 5956 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 5957 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); 5958 uvm_processor_id_t resident_id; 5959 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5960 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5961 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5962 5963 UVM_ASSERT(gpu_state->pte_is_2m); 5964 5965 resident_id = block_gpu_get_processor_to_map(block, gpu, 0); 5966 5967 uvm_pte_batch_begin(push, pte_batch); 5968 5969 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries 5970 // from the lower levels. This means we don't need to issue a TLB invalidate 5971 // when writing those levels. 5972 5973 // Cases to handle: 5974 // 1) Big PTEs which inherit curr_prot 5975 // 2) Big PTEs which get unmapped 5976 // 3) Big PTEs which are split to 4k 5977 // a) 4k PTEs which inherit curr_prot under the split big PTEs 5978 // b) 4k PTEs which get unmapped under the split big PTEs 5979 5980 // Compute the big PTEs which will need to be split to 4k, if any. 5981 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5982 5983 if (gpu_state->page_table_range_big.table) { 5984 // Case 1: Write the big PTEs which will inherit the 2M permissions, if 5985 // any. These are the big PTEs which are unchanged (uncovered) by the 5986 // operation. 5987 bitmap_andnot(big_ptes_inherit, 5988 new_pte_state->big_ptes, 5989 new_pte_state->big_ptes_covered, 5990 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 5991 5992 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); 5993 5994 // Case 2: Clear the new big PTEs which get unmapped (those not covering 5995 // 4ks) 5996 bitmap_and(big_ptes_new_prot, 5997 new_pte_state->big_ptes, 5998 new_pte_state->big_ptes_covered, 5999 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6000 6001 block_gpu_pte_clear_big(block, 6002 gpu, 6003 big_ptes_new_prot, 6004 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), 6005 pte_batch, 6006 NULL); 6007 6008 // Case 3: Write the big PTEs which cover 4k PTEs 6009 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); 6010 6011 // We just wrote all possible big PTEs, so mark them as initialized 6012 gpu_state->initialized_big = true; 6013 } 6014 else { 6015 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6016 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6017 } 6018 6019 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs 6020 block_gpu_pte_big_split_write_4k(block, 6021 block_context, 6022 gpu, 6023 resident_id, 6024 UVM_PROT_NONE, 6025 big_ptes_split, 6026 pages_to_unmap, 6027 pte_batch); 6028 6029 // And activate the 2M PDE. This ends the pte_batch and issues a single TLB 6030 // invalidate for the 2M entry. 
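    // Note that a single invalidate of the 2M entry is sufficient here: the 2M
    // entry was acting as a PTE until now, so the lower-level writes above were
    // never visible to the GPU TLBs and don't need their own invalidates.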
6031 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar); 6032 6033 gpu_state->pte_is_2m = false; 6034 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6035 } 6036 6037 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big 6038 // and 4k PTEs. 6039 // 6040 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped). 6041 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block, 6042 uvm_va_block_context_t *block_context, 6043 uvm_gpu_t *gpu, 6044 const uvm_page_mask_t *pages_to_unmap, 6045 uvm_push_t *push, 6046 uvm_membar_t tlb_membar) 6047 { 6048 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6049 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; 6050 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 6051 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 6052 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 6053 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6054 DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6055 DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6056 NvU32 big_page_size = tree->big_page_size; 6057 NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size); 6058 6059 UVM_ASSERT(!gpu_state->pte_is_2m); 6060 6061 uvm_pte_batch_begin(push, pte_batch); 6062 uvm_tlb_batch_begin(tree, tlb_batch); 6063 6064 // All of these cases might be performed in the same call: 6065 // 1) Split currently-big PTEs to 4k 6066 // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs 6067 // b) Clear new 4k PTEs which get unmapped under the split big PTEs 6068 // 2) Merge currently-4k PTEs to unmapped big 6069 // 3) Clear currently-big PTEs which wholly get unmapped 6070 // 4) Clear currently-4k PTEs which get unmapped 6071 // 5) Initialize big PTEs which are not covered by this operation 6072 6073 // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are 6074 // being split. We'll make the big PTEs inactive below after flushing these 6075 // writes. No TLB invalidate is needed since the big PTE is active. 6076 // 6077 // Mask computation: big_before && !big_after 6078 bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6079 6080 block_gpu_pte_big_split_write_4k(block, 6081 block_context, 6082 gpu, 6083 UVM_ID_INVALID, 6084 UVM_PROT_NONE, 6085 big_ptes_split, 6086 pages_to_unmap, 6087 pte_batch); 6088 6089 // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and 6090 // remain uncovered after the unmap. 6091 // 6092 // Mask computation: !big_before && !big_after 6093 bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6094 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after); 6095 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask)) 6096 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch); 6097 6098 // Case 5: If the big page table is newly-allocated, make sure that all big 6099 // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are 6100 // all initialized to invalid. 
6101 // 6102 // The similar case of making newly-allocated big PTEs unmapped when no 6103 // lower 4k table is present is handled by having 6104 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes 6105 // appropriately. 6106 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { 6107 // TODO: Bug 1766424: If we have the 4k page table already, we could 6108 // attempt to merge all uncovered big PTE regions when first 6109 // allocating the big table. That's probably not worth doing. 6110 UVM_ASSERT(gpu_state->page_table_range_4k.table); 6111 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6112 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); 6113 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); 6114 gpu_state->initialized_big = true; 6115 } 6116 6117 // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are 6118 // getting wholly unmapped, and those currently-big PTEs which are being 6119 // split to 4k. We can't directly transition from a valid big PTE to valid 6120 // lower PTEs, because that could cause the GPU TLBs to cache the same VA in 6121 // different cache lines. That could cause memory ordering to not be 6122 // maintained. 6123 // 6124 // Mask computation: (big_before && big_after && covered) || 6125 // (big_before && !big_after) 6126 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6127 bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6128 bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6129 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch); 6130 6131 // Case 2: Merge the new big PTEs and end the batches, now that we've done 6132 // all of the independent PTE writes we can. 6133 // 6134 // Mask computation: !big_before && big_after 6135 if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 6136 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB 6137 // batches. 6138 block_gpu_pte_merge_big_and_end(block, 6139 block_context, 6140 gpu, 6141 big_ptes_mask, 6142 push, 6143 pte_batch, 6144 tlb_batch, 6145 tlb_membar); 6146 } 6147 else { 6148 // End the batches. We have to commit the membars and TLB invalidates 6149 // before we finish splitting formerly-big PTEs. 6150 uvm_pte_batch_end(pte_batch); 6151 uvm_tlb_batch_end(tlb_batch, push, tlb_membar); 6152 } 6153 6154 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || 6155 block_gpu_needs_to_activate_table(block, gpu)) { 6156 uvm_pte_batch_begin(push, pte_batch); 6157 uvm_tlb_batch_begin(tree, tlb_batch); 6158 6159 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by 6160 // switching them from unmapped to invalid. This causes the GPU MMU to 6161 // start reading the 4k PTEs instead of stopping at the unmapped big 6162 // PTEs. 
6163 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); 6164 6165 uvm_pte_batch_end(pte_batch); 6166 6167 // Finally, activate the page tables if they're inactive 6168 if (block_gpu_needs_to_activate_table(block, gpu)) 6169 block_gpu_write_pde(block, gpu, push, tlb_batch); 6170 6171 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); 6172 } 6173 6174 // Update gpu_state 6175 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6176 } 6177 6178 // When PTE state is about to change (for example due to a map/unmap/revoke 6179 // operation), this function decides how to split and merge the PTEs in response 6180 // to that operation. 6181 // 6182 // The operation is described with the two page masks: 6183 // 6184 // - pages_changing indicates which pages will have their PTE mappings changed 6185 // on the GPU in some way as a result of the operation (for example, which 6186 // pages will actually have their mapping permissions upgraded). 6187 // 6188 // - page_mask_after indicates which pages on this GPU will have exactly the 6189 // same PTE attributes (permissions, residency) as pages_changing after the 6190 // operation is applied. 6191 // 6192 // PTEs are merged eagerly. 6193 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block, 6194 uvm_gpu_t *gpu, 6195 uvm_processor_id_t resident_id, 6196 const uvm_page_mask_t *pages_changing, 6197 const uvm_page_mask_t *page_mask_after, 6198 uvm_va_block_new_pte_state_t *new_pte_state) 6199 { 6200 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6201 uvm_va_block_region_t big_region_all, big_page_region, region; 6202 NvU32 big_page_size; 6203 uvm_page_index_t page_index; 6204 size_t big_page_index; 6205 DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6206 bool can_make_new_big_ptes; 6207 6208 memset(new_pte_state, 0, sizeof(*new_pte_state)); 6209 new_pte_state->needs_4k = true; 6210 6211 // TODO: Bug 1676485: Force a specific page size for perf testing 6212 6213 if (gpu_state->force_4k_ptes) 6214 return; 6215 6216 // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(), 6217 // hmm_range_fault(), and make_device_exclusive_range() don't handle folios 6218 // yet. Also, it makes mremap() difficult since the new address may not 6219 // align with the GPU block size otherwise. 6220 // If PAGE_SIZE is 64K, the code following this check is OK since 64K 6221 // big_pages is supported on all HMM supported GPUs (Turing+). 6222 // TODO: Bug 3368756: add support for transparent huge pages (THP). 6223 if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K) 6224 return; 6225 6226 UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after)); 6227 6228 // If all pages in the 2M mask have the same attributes after the 6229 // operation is applied, we can use a 2M PTE. 6230 if (block_gpu_supports_2m(block, gpu) && 6231 uvm_page_mask_full(page_mask_after) && 6232 (UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) { 6233 new_pte_state->pte_is_2m = true; 6234 new_pte_state->needs_4k = false; 6235 return; 6236 } 6237 6238 // Find big PTEs with matching attributes 6239 6240 // Can this block fit any big pages? 
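    // big_region_all below is the interior portion of the block which whole,
    // aligned big pages can cover. For example (hypothetical), a block which
    // spans less than one aligned big page has an empty region and can only
    // use 4k PTEs, so we return early with needs_4k still set.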
6241 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); 6242 big_region_all = uvm_va_block_big_page_region_all(block, big_page_size); 6243 if (big_region_all.first >= big_region_all.outer) 6244 return; 6245 6246 new_pte_state->needs_4k = false; 6247 6248 can_make_new_big_ptes = true; 6249 6250 // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+). 6251 if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages) 6252 can_make_new_big_ptes = false; 6253 6254 // We must not fail during teardown: unmap (resident_id == UVM_ID_INVALID) 6255 // with no splits required. That means we should avoid allocating PTEs 6256 // which are only needed for merges. 6257 // 6258 // This only matters if we're merging to big PTEs. If we're merging to 2M, 6259 // then we must already have the 2M level (since it has to be allocated 6260 // before the lower levels). 6261 // 6262 // If pte_is_2m already and we don't have a big table, we're splitting so we 6263 // have to allocate. 6264 if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m) 6265 can_make_new_big_ptes = false; 6266 6267 for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) { 6268 uvm_va_block_region_t contig_region = {0}; 6269 6270 big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size); 6271 big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); 6272 6273 if (!UVM_ID_IS_INVALID(resident_id)) 6274 contig_region = block_phys_contig_region(block, page_index, resident_id); 6275 6276 __set_bit(big_page_index, new_pte_state->big_ptes_covered); 6277 6278 // When mapping sysmem, we can use big pages only if we are mapping all 6279 // pages in the big page subregion and the CPU pages backing the 6280 // subregion are physically contiguous. 6281 if (can_make_new_big_ptes && 6282 uvm_page_mask_region_full(page_mask_after, big_page_region) && 6283 (!UVM_ID_IS_CPU(resident_id) || 6284 (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) { 6285 __set_bit(big_page_index, new_pte_state->big_ptes); 6286 } 6287 6288 if (!test_bit(big_page_index, new_pte_state->big_ptes)) 6289 new_pte_state->needs_4k = true; 6290 6291 // Skip to the end of the region 6292 page_index = big_page_region.outer - 1; 6293 } 6294 6295 if (!new_pte_state->needs_4k) { 6296 // All big page regions in pages_changing will be big PTEs. Now check if 6297 // there are any unaligned pages outside of big_region_all which are 6298 // changing. 6299 region = uvm_va_block_region(0, big_region_all.first); 6300 if (!uvm_page_mask_region_empty(pages_changing, region)) { 6301 new_pte_state->needs_4k = true; 6302 } 6303 else { 6304 region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block)); 6305 if (!uvm_page_mask_region_empty(pages_changing, region)) 6306 new_pte_state->needs_4k = true; 6307 } 6308 } 6309 6310 // Now add in the PTEs which should be big but weren't covered by this 6311 // operation. 6312 // 6313 // Note that we can't assume that a given page table range has been 6314 // initialized if it's present here, since it could have been allocated by a 6315 // thread which had to restart its operation due to allocation retry. 
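    // Three branches below: splitting from a (present or implicit) 2M PTE,
    // having no 4k table and not allocating one (so everything must be big),
    // or carrying forward the currently-big PTEs which this operation leaves
    // unchanged.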
6316 if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) { 6317 // We're splitting a 2M PTE so all of the uncovered big PTE regions will 6318 // become big PTEs which inherit the 2M permissions. If we haven't 6319 // allocated the 2M table yet, it will start as a 2M PTE until the lower 6320 // levels are allocated, so it's the same split case regardless of 6321 // whether this operation will need to retry a later allocation. 6322 bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6323 } 6324 else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) { 6325 // If we don't have 4k PTEs and we won't be allocating them for this 6326 // operation, all of our PTEs need to be big. 6327 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6328 bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6329 bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size)); 6330 } 6331 else { 6332 // Otherwise, add in all of the currently-big PTEs which are unchanging. 6333 // They won't be written, but they need to be carried into the new 6334 // gpu_state->big_ptes when it's updated. 6335 bitmap_andnot(big_ptes_not_covered, 6336 gpu_state->big_ptes, 6337 new_pte_state->big_ptes_covered, 6338 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6339 } 6340 6341 bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 6342 } 6343 6344 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that 6345 // handles allocation retry. If the block lock has been unlocked and relocked as 6346 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal 6347 // to the caller that the operation likely needs to be restarted. If that 6348 // happens, the pending tracker is added to the block's tracker. 6349 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block, 6350 uvm_gpu_t *gpu, 6351 NvU32 page_size, 6352 uvm_page_table_range_t *page_table_range, 6353 uvm_tracker_t *pending_tracker) 6354 { 6355 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 6356 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 6357 uvm_page_tree_t *page_tables = &gpu_va_space->page_tables; 6358 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); 6359 uvm_page_table_range_t local_range; 6360 NV_STATUS status; 6361 6362 // Blocks may contain large PTEs without starting on a PTE boundary or 6363 // having an aligned size. Cover the PTEs of this size in the block's 6364 // interior so we match uvm_va_block_gpu_state_t::big_ptes. 6365 NvU64 start = UVM_ALIGN_UP(va_block->start, page_size); 6366 NvU64 size = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start; 6367 6368 // VA blocks which can use the 2MB level as either a PTE or a PDE need to 6369 // account for the PDE specially, so they must use uvm_page_tree_alloc_table 6370 // to allocate the lower levels. 
6371 bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M; 6372 6373 UVM_ASSERT(page_table_range->table == NULL); 6374 6375 if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) { 6376 --va_block_test->page_table_allocation_retry_force_count; 6377 status = NV_ERR_NO_MEMORY; 6378 } 6379 else if (use_alloc_table) { 6380 // Pascal+: 4k/64k tables under a 2M entry 6381 UVM_ASSERT(gpu_state->page_table_range_2m.table); 6382 status = uvm_page_tree_alloc_table(page_tables, 6383 page_size, 6384 UVM_PMM_ALLOC_FLAGS_NONE, 6385 &gpu_state->page_table_range_2m, 6386 page_table_range); 6387 } 6388 else { 6389 // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+ 6390 status = uvm_page_tree_get_ptes(page_tables, 6391 page_size, 6392 start, 6393 size, 6394 UVM_PMM_ALLOC_FLAGS_NONE, 6395 page_table_range); 6396 } 6397 6398 if (status == NV_OK) 6399 goto allocated; 6400 6401 if (status != NV_ERR_NO_MEMORY) 6402 return status; 6403 6404 // Before unlocking the block lock, any pending work on the block has to be 6405 // added to the block's tracker. 6406 if (pending_tracker) { 6407 status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker); 6408 if (status != NV_OK) 6409 return status; 6410 } 6411 6412 // Unlock the va block and retry with eviction enabled 6413 uvm_mutex_unlock(&va_block->lock); 6414 6415 if (use_alloc_table) { 6416 // Although we don't hold the block lock here, it's safe to pass 6417 // gpu_state->page_table_range_2m to the page tree code because we know 6418 // that the 2m range has already been allocated, and that it can't go 6419 // away while we have the va_space lock held. 6420 status = uvm_page_tree_alloc_table(page_tables, 6421 page_size, 6422 UVM_PMM_ALLOC_FLAGS_EVICT, 6423 &gpu_state->page_table_range_2m, 6424 &local_range); 6425 } 6426 else { 6427 status = uvm_page_tree_get_ptes(page_tables, 6428 page_size, 6429 start, 6430 size, 6431 UVM_PMM_ALLOC_FLAGS_EVICT, 6432 &local_range); 6433 } 6434 6435 uvm_mutex_lock(&va_block->lock); 6436 6437 if (status != NV_OK) 6438 return status; 6439 6440 status = NV_ERR_MORE_PROCESSING_REQUIRED; 6441 6442 if (page_table_range->table) { 6443 // A different caller allocated the page tables in the meantime, release the 6444 // local copy. 6445 uvm_page_tree_put_ptes(page_tables, &local_range); 6446 return status; 6447 } 6448 6449 *page_table_range = local_range; 6450 6451 allocated: 6452 // Mark the 2M PTE as active when we first allocate it, since we don't have 6453 // any PTEs below it yet. 6454 if (page_size == UVM_PAGE_SIZE_2M) { 6455 UVM_ASSERT(!gpu_state->pte_is_2m); 6456 gpu_state->pte_is_2m = true; 6457 } 6458 else if (page_size != UVM_PAGE_SIZE_4K) { 6459 // uvm_page_tree_get_ptes initializes big PTEs to invalid. 6460 // uvm_page_tree_alloc_table does not, so we'll have to do it later. 6461 if (use_alloc_table) 6462 UVM_ASSERT(!gpu_state->initialized_big); 6463 else 6464 gpu_state->initialized_big = true; 6465 } 6466 6467 return status; 6468 } 6469 6470 // Helper which allocates all page table ranges necessary for the given page 6471 // sizes. See block_alloc_pt_range_with_retry. 
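// For example, block_alloc_ptes_new_state below may pass
// (uvm_va_block_gpu_big_page_size(va_block, gpu) | UVM_PAGE_SIZE_4K) for a
// mixed big/4k mapping; on GPUs which support 2M PTEs, UVM_PAGE_SIZE_2M is
// ORed in here automatically since the 2M level must be allocated before any
// of the lower levels.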
6472 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block, 6473 uvm_gpu_t *gpu, 6474 NvU32 page_sizes, 6475 uvm_tracker_t *pending_tracker) 6476 { 6477 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 6478 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); 6479 uvm_page_table_range_t *range; 6480 NvU32 page_size; 6481 NV_STATUS status, final_status = NV_OK; 6482 6483 UVM_ASSERT(gpu_state); 6484 6485 // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first 6486 // in order to allocate the levels below. 6487 if (block_gpu_supports_2m(va_block, gpu)) 6488 page_sizes |= UVM_PAGE_SIZE_2M; 6489 6490 UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes); 6491 6492 for_each_chunk_size_rev(page_size, page_sizes) { 6493 if (page_size == UVM_PAGE_SIZE_2M) 6494 range = &gpu_state->page_table_range_2m; 6495 else if (page_size == UVM_PAGE_SIZE_4K) 6496 range = &gpu_state->page_table_range_4k; 6497 else 6498 range = &gpu_state->page_table_range_big; 6499 6500 if (range->table) 6501 continue; 6502 6503 if (page_size == UVM_PAGE_SIZE_2M) { 6504 UVM_ASSERT(!gpu_state->pte_is_2m); 6505 UVM_ASSERT(!gpu_state->page_table_range_big.table); 6506 UVM_ASSERT(!gpu_state->page_table_range_4k.table); 6507 } 6508 else if (page_size != UVM_PAGE_SIZE_4K) { 6509 UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0); 6510 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6511 } 6512 6513 status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker); 6514 6515 // Keep going to allocate the remaining levels even if the allocation 6516 // requires a retry, since we'll likely still need them when we retry 6517 // anyway. 6518 if (status == NV_ERR_MORE_PROCESSING_REQUIRED) 6519 final_status = NV_ERR_MORE_PROCESSING_REQUIRED; 6520 else if (status != NV_OK) 6521 return status; 6522 } 6523 6524 return final_status; 6525 } 6526 6527 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block, 6528 uvm_gpu_t *gpu, 6529 uvm_va_block_new_pte_state_t *new_pte_state, 6530 uvm_tracker_t *pending_tracker) 6531 { 6532 NvU32 page_sizes = 0; 6533 6534 if (new_pte_state->pte_is_2m) { 6535 page_sizes |= UVM_PAGE_SIZE_2M; 6536 } 6537 else { 6538 if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 6539 page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu); 6540 6541 if (new_pte_state->needs_4k) 6542 page_sizes |= UVM_PAGE_SIZE_4K; 6543 else 6544 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); 6545 } 6546 6547 return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker); 6548 } 6549 6550 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block. 6551 // This is currently used on ATS systems to prevent GPUs from inadvertently 6552 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables, 6553 // which is where the NOATS bit resides. 6554 // 6555 // The current implementation simply pre-allocates the PTEs for the VA Block, 6556 // which is wasteful because the GPU may never need them. 6557 // 6558 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1 6559 // page table entries without having to request PTEs. 
6560 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block, 6561 uvm_gpu_va_space_t *gpu_va_space, 6562 uvm_tracker_t *pending_tracker) 6563 { 6564 NvU32 page_sizes; 6565 NvU32 big_page_size; 6566 uvm_gpu_t *gpu; 6567 uvm_va_block_gpu_state_t *gpu_state; 6568 6569 UVM_ASSERT(block); 6570 UVM_ASSERT(gpu_va_space); 6571 UVM_ASSERT(gpu_va_space->ats.enabled); 6572 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 6573 6574 gpu = gpu_va_space->gpu; 6575 big_page_size = gpu_va_space->page_tables.big_page_size; 6576 6577 gpu_state = block_gpu_state_get_alloc(block, gpu); 6578 if (!gpu_state) 6579 return NV_ERR_NO_MEMORY; 6580 6581 // If the VA Block supports 2M pages, allocate the 2M PTE only, as it 6582 // requires less memory 6583 if (block_gpu_supports_2m(block, gpu)) 6584 page_sizes = UVM_PAGE_SIZE_2M; 6585 else if (uvm_va_block_num_big_pages(block, big_page_size) > 0) 6586 page_sizes = big_page_size; 6587 else 6588 page_sizes = UVM_PAGE_SIZE_4K; 6589 6590 return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker); 6591 } 6592 6593 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker) 6594 { 6595 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6596 NV_STATUS status = NV_OK; 6597 6598 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See 6599 // comments in block_pre_populate_pde1_gpu. 6600 if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) { 6601 uvm_gpu_va_space_t *gpu_va_space; 6602 6603 for_each_gpu_va_space(gpu_va_space, va_space) { 6604 // We only care about systems where ATS is supported and the application 6605 // enabled it. 6606 if (!gpu_va_space->ats.enabled) 6607 continue; 6608 6609 status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker); 6610 if (status != NV_OK) 6611 break; 6612 } 6613 } 6614 6615 return status; 6616 } 6617 6618 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block, 6619 uvm_va_block_context_t *block_context, 6620 uvm_gpu_t *gpu, 6621 const uvm_page_mask_t *unmap_page_mask, 6622 uvm_tracker_t *out_tracker) 6623 { 6624 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); 6625 uvm_pte_bits_gpu_t pte_bit; 6626 uvm_push_t push; 6627 uvm_membar_t tlb_membar; 6628 bool only_local_mappings; 6629 uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask; 6630 NV_STATUS status; 6631 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 6632 bool mask_empty; 6633 6634 // We have to check gpu_state before looking at any VA space state like our 6635 // gpu_va_space, because we could be on the eviction path where we don't 6636 // have a lock on that state. However, since remove_gpu_va_space walks each 6637 // block to unmap the GPU before destroying the gpu_va_space, we're 6638 // guaranteed that if this GPU has page tables, the gpu_va_space can't go 6639 // away while we're holding the block lock. 6640 if (!block_gpu_has_page_tables(block, gpu)) 6641 return NV_OK; 6642 6643 if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])) 6644 return NV_OK; 6645 6646 // block_gpu_compute_new_pte_state needs a mask of pages which will have 6647 // matching attributes after the operation is performed. In the case of 6648 // unmap, those are the pages with unset bits. 
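    // That is, scratch_page_mask becomes ~(mapped && !pages_to_unmap): every
    // page which will be unmapped on this GPU once the operation completes.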
6649 uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap); 6650 uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask); 6651 block_gpu_compute_new_pte_state(block, 6652 gpu, 6653 UVM_ID_INVALID, 6654 pages_to_unmap, 6655 &block_context->scratch_page_mask, 6656 new_pte_state); 6657 6658 status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker); 6659 if (status != NV_OK) 6660 return status; 6661 6662 only_local_mappings = !block_has_remote_mapping_gpu(block, block_context, gpu->id, pages_to_unmap); 6663 tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings); 6664 6665 status = uvm_push_begin_acquire(gpu->channel_manager, 6666 UVM_CHANNEL_TYPE_MEMOPS, 6667 &block->tracker, 6668 &push, 6669 "Unmapping pages in block [0x%llx, 0x%llx)", 6670 block->start, 6671 block->end + 1); 6672 if (status != NV_OK) 6673 return status; 6674 6675 if (new_pte_state->pte_is_2m) { 6676 // We're either unmapping a whole valid 2M PTE, or we're unmapping all 6677 // remaining pages in a split 2M PTE. 6678 block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar); 6679 } 6680 else if (gpu_state->pte_is_2m) { 6681 // The block is currently mapped as a valid 2M PTE and we're unmapping 6682 // some pages within the 2M, so we have to split it into the appropriate 6683 // mix of big and 4k PTEs. 6684 block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); 6685 } 6686 else { 6687 // We're unmapping some pre-existing mix of big and 4K PTEs into some 6688 // other mix of big and 4K PTEs. 6689 block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); 6690 } 6691 6692 uvm_push_end(&push); 6693 6694 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { 6695 uvm_processor_mask_t non_uvm_lite_gpus; 6696 uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block)); 6697 6698 UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id)); 6699 6700 // If the GPU is the only non-UVM-Lite processor with mappings, we can 6701 // safely mark pages as fully unmapped 6702 if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1) 6703 uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap); 6704 } 6705 6706 // Clear block PTE state 6707 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 6708 mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], 6709 &gpu_state->pte_bits[pte_bit], 6710 pages_to_unmap); 6711 if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty) 6712 uvm_processor_mask_clear(&block->mapped, gpu->id); 6713 } 6714 6715 UVM_ASSERT(block_check_mappings(block)); 6716 6717 return uvm_tracker_add_push_safe(out_tracker, &push); 6718 } 6719 6720 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block, 6721 uvm_va_block_context_t *va_block_context, 6722 uvm_processor_id_t id, 6723 uvm_va_block_region_t region, 6724 const uvm_page_mask_t *unmap_page_mask, 6725 uvm_tracker_t *out_tracker) 6726 { 6727 uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask; 6728 6729 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 6730 uvm_assert_mutex_locked(&va_block->lock); 6731 6732 if (UVM_ID_IS_CPU(id)) { 6733 block_unmap_cpu(va_block, region, unmap_page_mask); 6734 return NV_OK; 6735 } 6736 6737 uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask); 6738 6739 return 
block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker); 6740 } 6741 6742 // This function essentially works as a wrapper around vm_insert_page (hence 6743 // the similar function prototype). This is needed since vm_insert_page 6744 // doesn't take permissions as input, but uses vma->vm_page_prot instead. 6745 // Since we may have multiple VA blocks under one VMA which need to map 6746 // with different permissions, we have to manually change vma->vm_page_prot for 6747 // each call to vm_insert_page. Multiple faults under one VMA in separate 6748 // blocks can be serviced concurrently, so the VMA wrapper lock is used 6749 // to protect access to vma->vm_page_prot. 6750 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma, 6751 NvU64 addr, 6752 struct page *page, 6753 uvm_prot_t new_prot) 6754 { 6755 uvm_vma_wrapper_t *vma_wrapper; 6756 unsigned long target_flags; 6757 pgprot_t target_pgprot; 6758 int ret; 6759 6760 UVM_ASSERT(vma); 6761 UVM_ASSERT(vma->vm_private_data); 6762 6763 vma_wrapper = vma->vm_private_data; 6764 target_flags = vma->vm_flags; 6765 6766 if (new_prot == UVM_PROT_READ_ONLY) 6767 target_flags &= ~VM_WRITE; 6768 6769 target_pgprot = vm_get_page_prot(target_flags); 6770 6771 // Take VMA wrapper lock to check vma->vm_page_prot 6772 uvm_down_read(&vma_wrapper->lock); 6773 6774 // Take a write lock if we need to modify the VMA vm_page_prot 6775 // - vma->vm_page_prot creates writable PTEs but new prot is RO 6776 // - vma->vm_page_prot creates read-only PTEs but new_prot is RW 6777 if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) { 6778 uvm_up_read(&vma_wrapper->lock); 6779 uvm_down_write(&vma_wrapper->lock); 6780 6781 vma->vm_page_prot = target_pgprot; 6782 6783 uvm_downgrade_write(&vma_wrapper->lock); 6784 } 6785 6786 ret = vm_insert_page(vma, addr, page); 6787 uvm_up_read(&vma_wrapper->lock); 6788 if (ret) { 6789 UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret); 6790 return errno_to_nv_status(ret); 6791 } 6792 6793 return NV_OK; 6794 } 6795 6796 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block, 6797 uvm_va_block_context_t *va_block_context, 6798 uvm_page_index_t page_index) 6799 { 6800 struct vm_area_struct *vma; 6801 uvm_prot_t logical_prot; 6802 6803 if (uvm_va_block_is_hmm(va_block)) { 6804 NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index); 6805 6806 logical_prot = uvm_hmm_compute_logical_prot(va_block, va_block_context, addr); 6807 } 6808 else { 6809 uvm_va_range_t *va_range = va_block->va_range; 6810 6811 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 6812 6813 // Zombified VA ranges no longer have a vma, so they have no permissions 6814 if (uvm_va_range_is_managed_zombie(va_range)) { 6815 logical_prot = UVM_PROT_NONE; 6816 } 6817 else { 6818 vma = uvm_va_range_vma(va_range); 6819 6820 if (!(vma->vm_flags & VM_READ)) 6821 logical_prot = UVM_PROT_NONE; 6822 else if (!(vma->vm_flags & VM_WRITE)) 6823 logical_prot = UVM_PROT_READ_ONLY; 6824 else 6825 logical_prot = UVM_PROT_READ_WRITE_ATOMIC; 6826 } 6827 } 6828 6829 return logical_prot; 6830 } 6831 6832 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page) 6833 { 6834 struct page *page; 6835 6836 if (UVM_ID_IS_CPU(block_page.processor)) { 6837 page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index); 6838 } 6839 else { 6840 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6841 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor); 6842 
size_t chunk_offset; 6843 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset); 6844 6845 UVM_ASSERT(gpu->mem_info.numa.enabled); 6846 page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE; 6847 } 6848 6849 UVM_ASSERT(page); 6850 return page; 6851 } 6852 6853 // Creates or upgrades a CPU mapping for the given page, updating the block's 6854 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page 6855 // will be mapped with at least new_prot permissions. 6856 // 6857 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use 6858 // block_unmap_cpu or uvm_va_block_revoke_prot instead. 6859 // 6860 // If the existing mapping is >= new_prot already, this is a no-op. 6861 // 6862 // It is the caller's responsibility to: 6863 // - Revoke mappings from other processors as appropriate so the CPU can map 6864 // with new_prot permissions 6865 // - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference 6866 // and mmap_lock is held in at least read mode) 6867 // - Ensure that the struct page corresponding to the physical memory being 6868 // mapped exists 6869 // - Manage the block's residency bitmap 6870 // - Ensure that the block hasn't been killed (block->va_range is present) 6871 // - Update the pte/mapping tracking state on success 6872 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block, 6873 uvm_va_block_context_t *va_block_context, 6874 uvm_processor_id_t resident_id, 6875 uvm_page_index_t page_index, 6876 uvm_prot_t new_prot) 6877 { 6878 uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index); 6879 uvm_va_range_t *va_range = block->va_range; 6880 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6881 struct vm_area_struct *vma; 6882 NV_STATUS status; 6883 NvU64 addr; 6884 struct page *page; 6885 6886 UVM_ASSERT(uvm_va_block_is_hmm(block) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 6887 UVM_ASSERT(new_prot != UVM_PROT_NONE); 6888 UVM_ASSERT(new_prot < UVM_PROT_MAX); 6889 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); 6890 6891 uvm_assert_mutex_locked(&block->lock); 6892 if (UVM_ID_IS_CPU(resident_id)) 6893 UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index)); 6894 6895 // For the CPU, write implies atomic 6896 if (new_prot == UVM_PROT_READ_WRITE) 6897 new_prot = UVM_PROT_READ_WRITE_ATOMIC; 6898 6899 // Only upgrades are supported in this function 6900 UVM_ASSERT(curr_prot <= new_prot); 6901 6902 if (new_prot == curr_prot) 6903 return NV_OK; 6904 6905 // Check for existing VMA permissions. They could have been modified after 6906 // the initial mmap by mprotect. 6907 if (new_prot > compute_logical_prot(block, va_block_context, page_index)) 6908 return NV_ERR_INVALID_ACCESS_TYPE; 6909 6910 if (uvm_va_block_is_hmm(block)) { 6911 // Do not map CPU pages because they belong to the Linux kernel. 6912 return NV_OK; 6913 } 6914 6915 UVM_ASSERT(va_range); 6916 6917 if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) { 6918 // Add the page's range group range to the range group's migrated list. 
6919 uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space, 6920 uvm_va_block_cpu_page_address(block, page_index)); 6921 if (rgr != NULL) { 6922 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock); 6923 if (list_empty(&rgr->range_group_migrated_list_node)) 6924 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges); 6925 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock); 6926 } 6927 } 6928 6929 // It's possible here that current->mm != vma->vm_mm. That can happen for 6930 // example due to access_process_vm (ptrace) or get_user_pages from another 6931 // driver. 6932 // 6933 // In such cases the caller has taken care of ref counting vma->vm_mm for 6934 // us, so we can safely operate on the vma but we can't use 6935 // uvm_va_range_vma_current. 6936 vma = uvm_va_range_vma(va_range); 6937 uvm_assert_mmap_lock_locked(vma->vm_mm); 6938 UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm); 6939 6940 // Add the mapping 6941 addr = uvm_va_block_cpu_page_address(block, page_index); 6942 6943 // This unmap handles upgrades as vm_insert_page returns -EBUSY when 6944 // there's already a mapping present at fault_addr, so we have to unmap 6945 // first anyway when upgrading from RO -> RW. 6946 if (curr_prot != UVM_PROT_NONE) 6947 unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1); 6948 6949 // Don't map the CPU until prior copies and GPU PTE updates finish, 6950 // otherwise we might not stay coherent. 6951 status = uvm_tracker_wait(&block->tracker); 6952 if (status != NV_OK) 6953 return status; 6954 6955 page = block_page_get(block, block_phys_page(resident_id, page_index)); 6956 return uvm_cpu_insert_page(vma, addr, page, new_prot); 6957 } 6958 6959 // Maps the CPU to the given pages which are resident on resident_id. 6960 // map_page_mask is an in/out parameter: the pages which are mapped to 6961 // resident_id are removed from the mask before returning. 6962 // 6963 // Caller must ensure that: 6964 // - Pages in map_page_mask must not be set in the corresponding cpu.pte_bits 6965 // mask for the requested protection. 6966 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block, 6967 uvm_va_block_context_t *block_context, 6968 uvm_processor_id_t resident_id, 6969 uvm_va_block_region_t region, 6970 uvm_page_mask_t *map_page_mask, 6971 uvm_prot_t new_prot, 6972 uvm_tracker_t *out_tracker) 6973 { 6974 NV_STATUS status = NV_OK; 6975 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 6976 uvm_page_index_t page_index; 6977 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; 6978 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id); 6979 uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot); 6980 uvm_pte_bits_cpu_t pte_bit; 6981 6982 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); 6983 6984 // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls 6985 // within block_map_cpu_page_to by doing them once here is helpful. 6986 6987 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, 6988 map_page_mask, 6989 &block->cpu.pte_bits[prot_pte_bit])); 6990 6991 // The pages which will actually change are those in the input page mask 6992 // which are resident on the target. 
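    // For example (hypothetical), if map_page_mask requests pages 0-63 but
    // only pages 0-31 are resident on resident_id, pages_to_map becomes 0-31
    // here and 32-63 stay in map_page_mask for uvm_va_block_map to try against
    // the next residency.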
6993 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) 6994 return NV_OK; 6995 6996 status = block_pre_populate_pde1_all_gpus(block, out_tracker); 6997 if (status != NV_OK) 6998 return status; 6999 7000 block->cpu.ever_mapped = true; 7001 7002 for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) { 7003 status = block_map_cpu_page_to(block, 7004 block_context, 7005 resident_id, 7006 page_index, 7007 new_prot); 7008 if (status != NV_OK) 7009 break; 7010 7011 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); 7012 } 7013 7014 // If there was some error, shrink the region so that we only update the 7015 // pte/mapping tracking bits for the pages that succeeded 7016 if (status != NV_OK) { 7017 region = uvm_va_block_region(region.first, page_index); 7018 uvm_page_mask_region_clear_outside(pages_to_map, region); 7019 } 7020 7021 // If pages are mapped from a remote residency, notify the remote mapping 7022 // events to tools. We skip event notification if the cause is Invalid. We 7023 // use it to signal that this function is being called from the revocation 7024 // path to avoid reporting duplicate events. 7025 if (UVM_ID_IS_GPU(resident_id) && 7026 va_space->tools.enabled && 7027 block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) { 7028 uvm_va_block_region_t subregion; 7029 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { 7030 uvm_tools_record_map_remote(block, 7031 NULL, 7032 UVM_ID_CPU, 7033 resident_id, 7034 uvm_va_block_region_start(block, subregion), 7035 uvm_va_block_region_size(subregion), 7036 block_context->mapping.cause); 7037 } 7038 } 7039 7040 // Update CPU mapping state 7041 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) 7042 uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map); 7043 7044 uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map); 7045 7046 UVM_ASSERT(block_check_mappings(block)); 7047 7048 // Remove all pages that were newly-mapped from the input mask 7049 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); 7050 7051 return status; 7052 } 7053 7054 // Maps the GPU to the given pages which are resident on resident_id. 7055 // map_page_mask is an in/out parameter: the pages which are mapped 7056 // to resident_id are removed from the mask before returning. 7057 // 7058 // Caller must ensure that: 7059 // - Pages in map_page_mask must not be set in the corresponding pte_bits mask 7060 // for the requested protection on the mapping GPU. 
7061 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block, 7062 uvm_va_block_context_t *block_context, 7063 uvm_gpu_t *gpu, 7064 uvm_processor_id_t resident_id, 7065 uvm_page_mask_t *map_page_mask, 7066 uvm_prot_t new_prot, 7067 uvm_tracker_t *out_tracker) 7068 { 7069 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7070 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7071 uvm_push_t push; 7072 NV_STATUS status; 7073 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; 7074 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id); 7075 uvm_pte_bits_gpu_t pte_bit; 7076 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot); 7077 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; 7078 block_pte_op_t pte_op; 7079 7080 UVM_ASSERT(map_page_mask); 7081 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id)); 7082 7083 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) 7084 UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location)); 7085 7086 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, 7087 map_page_mask, 7088 &gpu_state->pte_bits[prot_pte_bit])); 7089 7090 // The pages which will actually change are those in the input page mask 7091 // which are resident on the target. 7092 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) 7093 return NV_OK; 7094 7095 UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map)); 7096 7097 // For PTE merge/split computation, compute all resident pages which will 7098 // have exactly new_prot after performing the mapping. 7099 uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map); 7100 if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) { 7101 uvm_page_mask_andnot(&block_context->scratch_page_mask, 7102 &block_context->scratch_page_mask, 7103 &gpu_state->pte_bits[prot_pte_bit + 1]); 7104 } 7105 uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask); 7106 7107 block_gpu_compute_new_pte_state(va_block, 7108 gpu, 7109 resident_id, 7110 pages_to_map, 7111 &block_context->scratch_page_mask, 7112 new_pte_state); 7113 7114 status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker); 7115 if (status != NV_OK) 7116 return status; 7117 7118 status = uvm_push_begin_acquire(gpu->channel_manager, 7119 UVM_CHANNEL_TYPE_MEMOPS, 7120 &va_block->tracker, 7121 &push, 7122 "Mapping pages in block [0x%llx, 0x%llx) as %s", 7123 va_block->start, 7124 va_block->end + 1, 7125 uvm_prot_string(new_prot)); 7126 if (status != NV_OK) 7127 return status; 7128 7129 pte_op = BLOCK_PTE_OP_MAP; 7130 if (new_pte_state->pte_is_2m) { 7131 // We're either modifying permissions of a pre-existing 2M PTE, or all 7132 // permissions match so we can merge to a new 2M PTE. 7133 block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op); 7134 } 7135 else if (gpu_state->pte_is_2m) { 7136 // Permissions on a subset of the existing 2M PTE are being upgraded, so 7137 // we have to split it into the appropriate mix of big and 4k PTEs. 
7138 block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); 7139 } 7140 else { 7141 // We're upgrading permissions on some pre-existing mix of big and 4K 7142 // PTEs into some other mix of big and 4K PTEs. 7143 block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); 7144 } 7145 7146 // If we are mapping remotely, record the event 7147 if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) { 7148 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block); 7149 7150 UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid); 7151 7152 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { 7153 uvm_tools_record_map_remote(va_block, 7154 &push, 7155 gpu->id, 7156 resident_id, 7157 uvm_va_block_region_start(va_block, subregion), 7158 uvm_va_block_region_size(subregion), 7159 block_context->mapping.cause); 7160 } 7161 } 7162 7163 uvm_push_end(&push); 7164 7165 // Update GPU mapping state 7166 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) 7167 uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map); 7168 7169 uvm_processor_mask_set(&va_block->mapped, gpu->id); 7170 7171 // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages 7172 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) 7173 uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map); 7174 7175 // Remove all pages resident on this processor from the input mask, which 7176 // were newly-mapped. 7177 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); 7178 7179 UVM_ASSERT(block_check_mappings(va_block)); 7180 7181 return uvm_tracker_add_push_safe(out_tracker, &push); 7182 } 7183 7184 static void map_get_allowed_destinations(uvm_va_block_t *block, 7185 uvm_va_block_context_t *va_block_context, 7186 const uvm_va_policy_t *policy, 7187 uvm_processor_id_t id, 7188 uvm_processor_mask_t *allowed_mask) 7189 { 7190 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7191 7192 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) { 7193 // UVM-Lite can only map resident pages on the preferred location 7194 uvm_processor_mask_zero(allowed_mask); 7195 uvm_processor_mask_set(allowed_mask, policy->preferred_location); 7196 } 7197 else if ((uvm_va_policy_is_read_duplicate(policy, va_space) || 7198 (uvm_id_equal(policy->preferred_location, id) && 7199 !is_uvm_fault_force_sysmem_set() && 7200 !uvm_hmm_must_use_sysmem(block, va_block_context))) && 7201 uvm_va_space_processor_has_memory(va_space, id)) { 7202 // When operating under read-duplication we should only map the local 7203 // processor to cause fault-and-duplicate of remote pages. 7204 // 7205 // The same holds when this processor is the preferred location: only 7206 // create local mappings to force remote pages to fault-and-migrate. 
7207 uvm_processor_mask_zero(allowed_mask); 7208 uvm_processor_mask_set(allowed_mask, id); 7209 } 7210 else { 7211 // Common case: Just map wherever the memory happens to reside 7212 uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]); 7213 return; 7214 } 7215 7216 // Clamp to resident and accessible processors 7217 uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident); 7218 uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]); 7219 } 7220 7221 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block, 7222 uvm_va_block_context_t *va_block_context, 7223 uvm_processor_id_t id, 7224 uvm_va_block_region_t region, 7225 const uvm_page_mask_t *map_page_mask, 7226 uvm_prot_t new_prot, 7227 UvmEventMapRemoteCause cause, 7228 uvm_tracker_t *out_tracker) 7229 { 7230 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7231 uvm_gpu_t *gpu = NULL; 7232 uvm_processor_mask_t allowed_destinations; 7233 uvm_processor_id_t resident_id; 7234 const uvm_page_mask_t *pte_mask; 7235 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask; 7236 NV_STATUS status; 7237 7238 va_block_context->mapping.cause = cause; 7239 7240 UVM_ASSERT(new_prot != UVM_PROT_NONE); 7241 UVM_ASSERT(new_prot < UVM_PROT_MAX); 7242 uvm_assert_mutex_locked(&va_block->lock); 7243 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 7244 7245 // Mapping is not supported on the eviction path that doesn't hold the VA 7246 // space lock. 7247 uvm_assert_rwsem_locked(&va_space->lock); 7248 7249 if (UVM_ID_IS_CPU(id)) { 7250 uvm_pte_bits_cpu_t prot_pte_bit; 7251 7252 // Check if the current thread is allowed to call vm_insert_page 7253 if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm)) 7254 return NV_OK; 7255 7256 prot_pte_bit = get_cpu_pte_bit_index(new_prot); 7257 pte_mask = &va_block->cpu.pte_bits[prot_pte_bit]; 7258 } 7259 else { 7260 uvm_va_block_gpu_state_t *gpu_state; 7261 uvm_pte_bits_gpu_t prot_pte_bit; 7262 7263 gpu = uvm_va_space_get_gpu(va_space, id); 7264 7265 // Although this GPU UUID is registered in the VA space, it might not have a 7266 // GPU VA space registered. 7267 if (!uvm_gpu_va_space_get(va_space, gpu)) 7268 return NV_OK; 7269 7270 gpu_state = block_gpu_state_get_alloc(va_block, gpu); 7271 if (!gpu_state) 7272 return NV_ERR_NO_MEMORY; 7273 7274 prot_pte_bit = get_gpu_pte_bit_index(new_prot); 7275 pte_mask = &gpu_state->pte_bits[prot_pte_bit]; 7276 } 7277 7278 uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask); 7279 7280 if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask)) 7281 return NV_OK; 7282 7283 // Map per resident location so we can more easily detect physically- 7284 // contiguous mappings. 
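    // The loop below walks the candidate residencies in order of proximity to
    // the mapping processor, removing pages from running_page_mask as they are
    // mapped, and stops as soon as every requested page has been handled.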
7285 map_get_allowed_destinations(va_block, va_block_context, va_block_context->policy, id, &allowed_destinations); 7286 7287 for_each_closest_id(resident_id, &allowed_destinations, id, va_space) { 7288 if (UVM_ID_IS_CPU(id)) { 7289 status = block_map_cpu_to(va_block, 7290 va_block_context, 7291 resident_id, 7292 region, 7293 running_page_mask, 7294 new_prot, 7295 out_tracker); 7296 } 7297 else { 7298 status = block_map_gpu_to(va_block, 7299 va_block_context, 7300 gpu, 7301 resident_id, 7302 running_page_mask, 7303 new_prot, 7304 out_tracker); 7305 } 7306 7307 if (status != NV_OK) 7308 return status; 7309 7310 // If we've mapped all requested pages, we're done 7311 if (uvm_page_mask_region_empty(running_page_mask, region)) 7312 break; 7313 } 7314 7315 return NV_OK; 7316 } 7317 7318 // Revokes the given pages mapped by cpu. This is implemented by unmapping all 7319 // pages and mapping them later with the lower permission. This is required 7320 // because vm_insert_page can only be used for upgrades from Invalid. 7321 // 7322 // Caller must ensure that: 7323 // - Pages in revoke_page_mask must be set in the 7324 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask. 7325 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block, 7326 uvm_va_block_context_t *block_context, 7327 uvm_va_block_region_t region, 7328 const uvm_page_mask_t *revoke_page_mask, 7329 uvm_tracker_t *out_tracker) 7330 { 7331 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7332 uvm_va_block_region_t subregion; 7333 7334 UVM_ASSERT(revoke_page_mask); 7335 7336 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); 7337 7338 block_unmap_cpu(block, region, revoke_page_mask); 7339 7340 // Coalesce revocation event notification 7341 for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) { 7342 uvm_perf_event_notify_revocation(&va_space->perf_events, 7343 block, 7344 UVM_ID_CPU, 7345 uvm_va_block_region_start(block, subregion), 7346 uvm_va_block_region_size(subregion), 7347 UVM_PROT_READ_WRITE_ATOMIC, 7348 UVM_PROT_READ_ONLY); 7349 } 7350 7351 // uvm_va_block_map will skip this remap if we aren't holding the right mm 7352 // lock. 
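    // That's acceptable: the pages are simply left unmapped on the CPU and can
    // be mapped again later at the lower permission (for example, on a
    // subsequent CPU fault).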
7353     return uvm_va_block_map(block,
7354                             block_context,
7355                             UVM_ID_CPU,
7356                             region,
7357                             revoke_page_mask,
7358                             UVM_PROT_READ_ONLY,
7359                             UvmEventMapRemoteCauseInvalid,
7360                             out_tracker);
7361 }
7362
7363 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
7364                                               uvm_va_block_context_t *block_context,
7365                                               uvm_gpu_t *gpu,
7366                                               uvm_prot_t prot_revoked,
7367                                               const uvm_page_mask_t *pages_revoked)
7368 {
7369     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7370     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7371     uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
7372     uvm_pte_bits_gpu_t pte_bit;
7373
7374     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
7375         uvm_prot_t old_prot;
7376
7377         if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
7378             continue;
7379
7380         if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
7381             old_prot = UVM_PROT_READ_WRITE_ATOMIC;
7382         else
7383             old_prot = UVM_PROT_READ_WRITE;
7384
7385         for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
7386             uvm_perf_event_notify_revocation(&va_space->perf_events,
7387                                              block,
7388                                              gpu->id,
7389                                              uvm_va_block_region_start(block, subregion),
7390                                              uvm_va_block_region_size(subregion),
7391                                              old_prot,
7392                                              prot_revoked - 1);
7393         }
7394     }
7395 }
7396
7397 // Revokes the given pages mapped by the GPU which are resident on resident_id.
7398 // revoke_page_mask is an in/out parameter: the pages which have the appropriate
7399 // permissions and are resident on resident_id are removed from the mask before
7400 // returning.
7401 //
7402 // Caller must ensure that:
7403 // - Pages in revoke_page_mask must be set in the corresponding pte_bits mask
7404 //   for the protection to be revoked on the mapping GPU.
7405 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
7406                                           uvm_va_block_context_t *block_context,
7407                                           uvm_gpu_t *gpu,
7408                                           uvm_processor_id_t resident_id,
7409                                           uvm_page_mask_t *revoke_page_mask,
7410                                           uvm_prot_t prot_to_revoke,
7411                                           uvm_tracker_t *out_tracker)
7412 {
7413     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7414     uvm_push_t push;
7415     NV_STATUS status;
7416     uvm_pte_bits_gpu_t pte_bit;
7417     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7418     uvm_prot_t new_prot = prot_to_revoke - 1;
7419     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7420     block_pte_op_t pte_op;
7421     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7422     uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
7423
7424     UVM_ASSERT(revoke_page_mask);
7425     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
7426
7427     // The pages which will actually change are those in the input page mask
7428     // which are resident on the target.
7429     if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
7430         return NV_OK;
7431
7432     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke));
7433
7434     // For PTE merge/split computation, compute all resident pages which will
7435     // have exactly prot_to_revoke-1 after performing the revocation.
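    // For example, revoking UVM_PROT_READ_WRITE_ATOMIC selects the resident
    // pages being downgraded from atomic plus those already mapped at exactly
    // UVM_PROT_READ_WRITE, i.e. everything that ends up at the new, lower
    // protection.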
7436     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
7437     uvm_page_mask_andnot(&block_context->scratch_page_mask,
7438                          &gpu_state->pte_bits[prot_pte_bit - 1],
7439                          &block_context->scratch_page_mask);
7440     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7441
7442     block_gpu_compute_new_pte_state(va_block,
7443                                     gpu,
7444                                     resident_id,
7445                                     pages_to_revoke,
7446                                     &block_context->scratch_page_mask,
7447                                     new_pte_state);
7448
7449     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7450     if (status != NV_OK)
7451         return status;
7452
7453     status = uvm_push_begin_acquire(gpu->channel_manager,
7454                                     UVM_CHANNEL_TYPE_MEMOPS,
7455                                     &va_block->tracker,
7456                                     &push,
7457                                     "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
7458                                     uvm_prot_string(prot_to_revoke),
7459                                     va_block->start,
7460                                     va_block->end + 1);
7461     if (status != NV_OK)
7462         return status;
7463
7464     pte_op = BLOCK_PTE_OP_REVOKE;
7465     if (new_pte_state->pte_is_2m) {
7466         // We're either modifying permissions of a pre-existing 2M PTE, or all
7467         // permissions match so we can merge to a new 2M PTE.
7468         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7469     }
7470     else if (gpu_state->pte_is_2m) {
7471         // Permissions on a subset of the existing 2M PTE are being downgraded,
7472         // so we have to split it into the appropriate mix of big and 4k PTEs.
7473         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7474     }
7475     else {
7476         // We're downgrading permissions on some pre-existing mix of big and 4K
7477         // PTEs into some other mix of big and 4K PTEs.
7478         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7479     }
7480
7481     uvm_push_end(&push);
7482
7483     block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
7484
7485     // Update GPU mapping state
7486     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
7487         uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
7488
7489     // Remove all pages resident on this processor from the input mask: both
7490     // pages which were revoked and pages which already had the correct
7491     // permissions.
7492 uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke); 7493 7494 UVM_ASSERT(block_check_mappings(va_block)); 7495 7496 return uvm_tracker_add_push_safe(out_tracker, &push); 7497 } 7498 7499 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block, 7500 uvm_va_block_context_t *va_block_context, 7501 uvm_processor_id_t id, 7502 uvm_va_block_region_t region, 7503 const uvm_page_mask_t *revoke_page_mask, 7504 uvm_prot_t prot_to_revoke, 7505 uvm_tracker_t *out_tracker) 7506 { 7507 uvm_gpu_t *gpu; 7508 uvm_va_block_gpu_state_t *gpu_state; 7509 uvm_processor_mask_t resident_procs; 7510 uvm_processor_id_t resident_id; 7511 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask; 7512 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7513 uvm_pte_bits_gpu_t prot_pte_bit; 7514 7515 UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY); 7516 UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX); 7517 uvm_assert_mutex_locked(&va_block->lock); 7518 7519 if (UVM_ID_IS_CPU(id)) { 7520 if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC) 7521 return NV_OK; 7522 7523 if (uvm_va_block_is_hmm(va_block)) { 7524 // Linux is responsible for CPU page table updates. 7525 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region); 7526 return NV_OK; 7527 } 7528 7529 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); 7530 7531 if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])) 7532 return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker); 7533 7534 return NV_OK; 7535 } 7536 7537 gpu = uvm_va_space_get_gpu(va_space, id); 7538 7539 // UVM-Lite GPUs should never have access revoked 7540 UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id), 7541 "GPU %s\n", uvm_gpu_name(gpu)); 7542 7543 // Return early if there are no mappings for the GPU present in the block 7544 if (!uvm_processor_mask_test(&va_block->mapped, gpu->id)) 7545 return NV_OK; 7546 7547 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7548 prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke); 7549 7550 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); 7551 7552 if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit])) 7553 return NV_OK; 7554 7555 // Revoke per resident location so we can more easily detect physically- 7556 // contiguous mappings. 
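    // block_revoke_prot_gpu_to() trims running_page_mask as each residency is
    // processed, so the loop below can stop as soon as every requested page
    // has been handled.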
7557 uvm_processor_mask_copy(&resident_procs, &va_block->resident); 7558 7559 for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) { 7560 NV_STATUS status = block_revoke_prot_gpu_to(va_block, 7561 va_block_context, 7562 gpu, 7563 resident_id, 7564 running_page_mask, 7565 prot_to_revoke, 7566 out_tracker); 7567 if (status != NV_OK) 7568 return status; 7569 7570 // If we've revoked all requested pages, we're done 7571 if (uvm_page_mask_region_empty(running_page_mask, region)) 7572 break; 7573 } 7574 7575 return NV_OK; 7576 } 7577 7578 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block, 7579 uvm_va_block_context_t *va_block_context, 7580 const uvm_processor_mask_t *map_processor_mask, 7581 uvm_va_block_region_t region, 7582 const uvm_page_mask_t *map_page_mask, 7583 uvm_prot_t new_prot, 7584 UvmEventMapRemoteCause cause) 7585 { 7586 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7587 NV_STATUS status = NV_OK; 7588 NV_STATUS tracker_status; 7589 uvm_processor_id_t id; 7590 7591 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 7592 7593 for_each_id_in_mask(id, map_processor_mask) { 7594 status = uvm_va_block_map(va_block, 7595 va_block_context, 7596 id, 7597 region, 7598 map_page_mask, 7599 new_prot, 7600 cause, 7601 &local_tracker); 7602 if (status != NV_OK) 7603 break; 7604 } 7605 7606 // Regardless of error, add the successfully-pushed mapping operations into 7607 // the block's tracker. Note that we can't overwrite the tracker because we 7608 // aren't guaranteed that the map actually pushed anything (in which case it 7609 // would've acquired the block tracker first). 7610 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7611 uvm_tracker_deinit(&local_tracker); 7612 7613 return status == NV_OK ? tracker_status : status; 7614 } 7615 7616 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block, 7617 uvm_va_block_context_t *va_block_context, 7618 const uvm_processor_mask_t *unmap_processor_mask, 7619 uvm_va_block_region_t region, 7620 const uvm_page_mask_t *unmap_page_mask) 7621 { 7622 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7623 NV_STATUS status = NV_OK; 7624 NV_STATUS tracker_status; 7625 uvm_processor_id_t id; 7626 7627 // Watch out, unmap_mask could change during iteration since it could be 7628 // va_block->mapped. 7629 for_each_id_in_mask(id, unmap_processor_mask) { 7630 // Errors could either be a system-fatal error (ECC) or an allocation 7631 // retry due to PTE splitting. In either case we should stop after 7632 // hitting the first one. 7633 status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker); 7634 if (status != NV_OK) 7635 break; 7636 } 7637 7638 // See the comment in uvm_va_block_map_mask for adding to the tracker. 7639 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7640 uvm_tracker_deinit(&local_tracker); 7641 7642 return status == NV_OK ? 
tracker_status : status; 7643 } 7644 7645 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block, 7646 uvm_va_block_context_t *va_block_context, 7647 const uvm_processor_mask_t *revoke_processor_mask, 7648 uvm_va_block_region_t region, 7649 const uvm_page_mask_t *revoke_page_mask, 7650 uvm_prot_t prot_to_revoke) 7651 { 7652 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7653 NV_STATUS status = NV_OK; 7654 NV_STATUS tracker_status; 7655 uvm_processor_id_t id; 7656 7657 for_each_id_in_mask(id, revoke_processor_mask) { 7658 status = uvm_va_block_revoke_prot(va_block, 7659 va_block_context, 7660 id, 7661 region, 7662 revoke_page_mask, 7663 prot_to_revoke, 7664 &local_tracker); 7665 if (status != NV_OK) 7666 break; 7667 } 7668 7669 // See the comment in uvm_va_block_map_mask for adding to the tracker. 7670 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 7671 uvm_tracker_deinit(&local_tracker); 7672 7673 return status == NV_OK ? tracker_status : status; 7674 } 7675 7676 // Updates the read_duplicated_pages mask in the block when the state of GPU id 7677 // is being destroyed 7678 static void update_read_duplicated_pages_mask(uvm_va_block_t *block, 7679 uvm_gpu_id_t id, 7680 uvm_va_block_gpu_state_t *gpu_state) 7681 { 7682 uvm_gpu_id_t running_id; 7683 bool first = true; 7684 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7685 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7686 uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask; 7687 uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask; 7688 7689 uvm_page_mask_zero(&block->read_duplicated_pages); 7690 7691 for_each_id_in_mask(running_id, &block->resident) { 7692 const uvm_page_mask_t *running_residency_mask; 7693 7694 if (uvm_id_equal(running_id, id)) 7695 continue; 7696 7697 running_residency_mask = uvm_va_block_resident_mask_get(block, running_id); 7698 7699 if (first) { 7700 uvm_page_mask_copy(running_page_mask, running_residency_mask); 7701 first = false; 7702 continue; 7703 } 7704 7705 if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask)) 7706 uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask); 7707 7708 uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask); 7709 } 7710 } 7711 7712 // Unmaps all GPU mappings under this block, frees the page tables, and frees 7713 // all the GPU chunks. This simply drops the chunks on the floor, so the caller 7714 // must take care of copying the data elsewhere if it needs to remain intact. 7715 // 7716 // This serializes on the block tracker since it must unmap page tables. 
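//
// The caller must hold the block lock.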
7717 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id) 7718 { 7719 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 7720 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 7721 uvm_gpu_va_space_t *gpu_va_space; 7722 uvm_gpu_t *gpu, *other_gpu; 7723 7724 if (!gpu_state) 7725 return; 7726 7727 uvm_assert_mutex_locked(&block->lock); 7728 7729 // Unmap PTEs and free page tables 7730 gpu = uvm_va_space_get_gpu(va_space, id); 7731 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); 7732 if (gpu_va_space) { 7733 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7734 7735 uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context); 7736 } 7737 7738 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); 7739 7740 // No processor should have this GPU mapped at this point 7741 UVM_ASSERT(block_check_processor_not_mapped(block, id)); 7742 7743 // We need to remove the mappings of the indirect peers from the reverse 7744 // map when the GPU state is being destroyed (for example, on 7745 // unregister_gpu) and when peer access between indirect peers is disabled. 7746 // However, we need to avoid double mapping removals. There are two 7747 // possible scenarios: 7748 // - Disable peer access first. This will remove all mappings between A and 7749 // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to 7750 // unregister_gpu will not operate on that pair of GPUs. 7751 // - Unregister GPU first. This will remove all mappings from all indirect 7752 // peers to the GPU being unregistered. It will also destroy its GPU state. 7753 // Subsequent calls to disable peers will remove the mappings from the GPU 7754 // being unregistered, but never to the GPU being unregistered (since it no 7755 // longer has a valid GPU state). 7756 for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) 7757 block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu); 7758 7759 if (gpu_state->chunks) { 7760 size_t i, num_chunks; 7761 7762 update_read_duplicated_pages_mask(block, id, gpu_state); 7763 uvm_page_mask_zero(&gpu_state->resident); 7764 block_clear_resident_processor(block, id); 7765 7766 num_chunks = block_num_gpu_chunks(block, gpu); 7767 for (i = 0; i < num_chunks; i++) { 7768 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 7769 if (!chunk) 7770 continue; 7771 7772 uvm_mmu_chunk_unmap(chunk, &block->tracker); 7773 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); 7774 } 7775 7776 uvm_kvfree(gpu_state->chunks); 7777 } 7778 else { 7779 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); 7780 } 7781 7782 7783 // Pending operations may still need the DMA memory to be mapped. 
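    // Wait for them to finish before tearing down the physical CPU-side
    // mappings below.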
7784 uvm_tracker_wait(&block->tracker); 7785 7786 block_gpu_unmap_phys_all_cpu_pages(block, gpu); 7787 uvm_processor_mask_clear(&block->evicted_gpus, id); 7788 7789 kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state); 7790 block->gpus[uvm_id_gpu_index(id)] = NULL; 7791 } 7792 7793 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range) 7794 { 7795 if (range->table) { 7796 uvm_page_tree_put_ptes(tree, range); 7797 memset(range, 0, sizeof(*range)); 7798 } 7799 } 7800 7801 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space) 7802 { 7803 uvm_assert_mutex_locked(&va_block->lock); 7804 7805 if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped) 7806 return NV_OK; 7807 7808 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See 7809 // comments in pre_populate_pde1_gpu. 7810 return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL); 7811 } 7812 7813 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, 7814 uvm_gpu_va_space_t *gpu_va_space, 7815 uvm_va_block_context_t *block_context) 7816 { 7817 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; 7818 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; 7819 uvm_gpu_t *gpu = gpu_va_space->gpu; 7820 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 7821 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); 7822 uvm_push_t push; 7823 NV_STATUS status; 7824 7825 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 7826 7827 if (!gpu_state) 7828 return; 7829 7830 uvm_assert_mutex_locked(&va_block->lock); 7831 7832 // Unmapping the whole block won't cause a page table split, so this should 7833 // only fail if we have a system-fatal error. 7834 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker); 7835 if (status != NV_OK) { 7836 UVM_ASSERT(status == uvm_global_get_status()); 7837 return; // Just leak 7838 } 7839 7840 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); 7841 7842 // Reset the page tables if other allocations could reuse them 7843 if (!block_gpu_supports_2m(va_block, gpu) && 7844 !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { 7845 7846 status = uvm_push_begin_acquire(gpu->channel_manager, 7847 UVM_CHANNEL_TYPE_MEMOPS, 7848 &local_tracker, 7849 &push, 7850 "Resetting PTEs for block [0x%llx, 0x%llx)", 7851 va_block->start, 7852 va_block->end + 1); 7853 if (status != NV_OK) { 7854 UVM_ASSERT(status == uvm_global_get_status()); 7855 return; // Just leak 7856 } 7857 7858 uvm_pte_batch_begin(&push, pte_batch); 7859 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); 7860 7861 // When the big PTEs is active, the 4k PTEs under it are garbage. Make 7862 // them invalid so the page tree code can reuse them for other 7863 // allocations on this VA. These don't need TLB invalidates since the 7864 // big PTEs above them are active. 7865 if (gpu_state->page_table_range_4k.table) { 7866 uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes); 7867 block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); 7868 } 7869 7870 // We unmapped all big PTEs above, which means they have the unmapped 7871 // pattern so the GPU MMU won't read 4k PTEs under them. Set them to 7872 // invalid to activate the 4ks below so new allocations using just those 7873 // 4k PTEs will work. 
7874 block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch); 7875 7876 uvm_pte_batch_end(pte_batch); 7877 uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE); 7878 7879 uvm_push_end(&push); 7880 uvm_tracker_overwrite_with_push(&local_tracker, &push); 7881 } 7882 7883 // The unmap must finish before we free the page tables 7884 status = uvm_tracker_wait_deinit(&local_tracker); 7885 if (status != NV_OK) 7886 return; // System-fatal error, just leak 7887 7888 // Note that if the PTE is currently 2M with lower tables allocated but not 7889 // in use, calling put_ptes on those lower ranges will re-write the 2M entry 7890 // to be a PDE. 7891 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k); 7892 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big); 7893 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m); 7894 7895 gpu_state->pte_is_2m = false; 7896 gpu_state->initialized_big = false; 7897 gpu_state->activated_big = false; 7898 gpu_state->activated_4k = false; 7899 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 7900 7901 UVM_ASSERT(block_check_mappings(va_block)); 7902 } 7903 7904 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 7905 { 7906 NV_STATUS status; 7907 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7908 7909 UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID); 7910 uvm_assert_rwsem_locked_write(&va_space->lock); 7911 uvm_assert_mutex_locked(&va_block->lock); 7912 7913 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { 7914 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7915 if (status != NV_OK) 7916 return status; 7917 7918 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0); 7919 if (status != NV_OK) { 7920 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7921 return status; 7922 } 7923 } 7924 7925 // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we 7926 // call it here. 7927 7928 return NV_OK; 7929 } 7930 7931 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 7932 { 7933 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7934 NV_STATUS status; 7935 uvm_tracker_t tracker = UVM_TRACKER_INIT(); 7936 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7937 uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask; 7938 const uvm_page_mask_t *resident0; 7939 const uvm_page_mask_t *resident1; 7940 7941 uvm_assert_mutex_locked(&va_block->lock); 7942 7943 // See comment in block_destroy_gpu_state 7944 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { 7945 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); 7946 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0); 7947 } 7948 7949 // If either of the GPUs doesn't have GPU state then nothing could be mapped 7950 // between them. 
7951 if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id)) 7952 return; 7953 7954 resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id); 7955 resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id); 7956 7957 // Unmap all pages resident on gpu1, but not on gpu0, from gpu0 7958 if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) { 7959 status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker); 7960 if (status != NV_OK) { 7961 // Since all PTEs unmapped by this call have the same aperture, page 7962 // splits should never be required so any failure should be the 7963 // result of a system-fatal error. 7964 UVM_ASSERT_MSG(status == uvm_global_get_status(), 7965 "Unmapping failed: %s, GPU %s\n", 7966 nvstatusToString(status), 7967 uvm_gpu_name(gpu0)); 7968 } 7969 } 7970 7971 // Unmap all pages resident on gpu0, but not on gpu1, from gpu1 7972 if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) { 7973 status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker); 7974 if (status != NV_OK) { 7975 UVM_ASSERT_MSG(status == uvm_global_get_status(), 7976 "Unmapping failed: %s, GPU %s\n", 7977 nvstatusToString(status), 7978 uvm_gpu_name(gpu0)); 7979 } 7980 } 7981 7982 status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker); 7983 if (status != NV_OK) 7984 UVM_ASSERT(status == uvm_global_get_status()); 7985 7986 status = uvm_tracker_wait_deinit(&tracker); 7987 if (status != NV_OK) 7988 UVM_ASSERT(status == uvm_global_get_status()); 7989 } 7990 7991 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu) 7992 { 7993 NV_STATUS status; 7994 uvm_va_range_t *va_range = va_block->va_range; 7995 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 7996 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 7997 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); 7998 7999 uvm_assert_mutex_locked(&va_block->lock); 8000 UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id)); 8001 8002 // If the GPU doesn't have GPU state then nothing could be mapped. 8003 if (!uvm_va_block_gpu_state_get(va_block, gpu->id)) 8004 return; 8005 8006 // In UVM-Lite mode, mappings to the preferred location are not tracked 8007 // directly, so just unmap the whole block. 8008 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker); 8009 if (status != NV_OK) { 8010 // Unmapping the whole block should not cause page splits so any failure 8011 // should be the result of a system-fatal error. 8012 UVM_ASSERT_MSG(status == uvm_global_get_status(), 8013 "Unmapping failed: %s, GPU %s\n", 8014 nvstatusToString(status), uvm_gpu_name(gpu)); 8015 } 8016 8017 status = uvm_tracker_wait(&va_block->tracker); 8018 if (status != NV_OK) { 8019 UVM_ASSERT_MSG(status == uvm_global_get_status(), 8020 "Unmapping failed: %s, GPU %s\n", 8021 nvstatusToString(status), uvm_gpu_name(gpu)); 8022 } 8023 } 8024 8025 // Evict pages from the GPU by moving each resident region to the CPU 8026 // 8027 // Notably the caller needs to support allocation-retry as 8028 // uvm_va_block_migrate_locked() requires that. 
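//
// uvm_va_block_unregister_gpu_locked(), for example, wraps the call in
// UVM_VA_BLOCK_RETRY_LOCKED() for this reason.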
8029 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8030 { 8031 NV_STATUS status = NV_OK; 8032 const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id); 8033 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); 8034 uvm_va_block_region_t subregion; 8035 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 8036 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm); 8037 8038 // Move all subregions resident on the GPU to the CPU 8039 for_each_va_block_subregion_in_mask(subregion, resident, region) { 8040 if (uvm_va_block_is_hmm(va_block)) { 8041 status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, 8042 gpu, 8043 block_context, 8044 resident, 8045 subregion); 8046 } 8047 else { 8048 status = uvm_va_block_migrate_locked(va_block, 8049 NULL, 8050 block_context, 8051 subregion, 8052 UVM_ID_CPU, 8053 UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, 8054 NULL); 8055 } 8056 if (status != NV_OK) 8057 return status; 8058 } 8059 8060 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id)); 8061 return NV_OK; 8062 } 8063 8064 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8065 { 8066 NV_STATUS status; 8067 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 8068 8069 uvm_assert_mutex_locked(&va_block->lock); 8070 8071 if (!gpu_state) 8072 return; 8073 8074 // The mappings should've already been torn down by GPU VA space unregister 8075 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); 8076 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); 8077 UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu)); 8078 8079 // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and 8080 // we don't rely on any state of the block across the call. 8081 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm)); 8082 if (status != NV_OK) { 8083 UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n", 8084 nvstatusToString(status), 8085 uvm_gpu_name(gpu)); 8086 uvm_global_set_fatal_error(status); 8087 } 8088 8089 // This function will copy the block's tracker into each chunk then free the 8090 // chunk to PMM. If we do this before waiting for the block tracker below 8091 // we'll populate PMM's free chunks with tracker entries, which gives us 8092 // better testing coverage of chunk synchronization on GPU unregister. 8093 block_destroy_gpu_state(va_block, gpu->id); 8094 8095 // Any time a GPU is unregistered we need to make sure that there are no 8096 // pending (direct or indirect) tracker entries for that GPU left in the 8097 // block's tracker. The only way to ensure that is to wait for the whole 8098 // tracker. 8099 status = uvm_tracker_wait(&va_block->tracker); 8100 if (status != NV_OK) 8101 UVM_ASSERT(status == uvm_global_get_status()); 8102 } 8103 8104 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) 8105 { 8106 // Take the lock internally to not expose the caller to allocation-retry. 
8107 uvm_mutex_lock(&va_block->lock); 8108 8109 uvm_va_block_unregister_gpu_locked(va_block, gpu, mm); 8110 8111 uvm_mutex_unlock(&va_block->lock); 8112 } 8113 8114 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region) 8115 { 8116 uvm_page_index_t page_index; 8117 8118 uvm_assert_mutex_locked(&va_block->lock); 8119 8120 for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region) 8121 block_mark_cpu_page_dirty(va_block, page_index); 8122 } 8123 8124 // Tears down everything within the block, but doesn't free the block itself. 8125 // Note that when uvm_va_block_kill is called, this is called twice: once for 8126 // the initial kill itself, then again when the block's ref count is eventually 8127 // destroyed. block->va_range is used to track whether the block has already 8128 // been killed. 8129 static void block_kill(uvm_va_block_t *block) 8130 { 8131 uvm_va_space_t *va_space; 8132 uvm_perf_event_data_t event_data; 8133 uvm_cpu_chunk_t *chunk; 8134 uvm_gpu_id_t id; 8135 NV_STATUS status; 8136 uvm_va_block_region_t region = uvm_va_block_region_from_block(block); 8137 uvm_page_index_t page_index; 8138 uvm_page_index_t next_page_index; 8139 8140 if (uvm_va_block_is_dead(block)) 8141 return; 8142 8143 va_space = uvm_va_block_get_va_space(block); 8144 event_data.block_destroy.block = block; 8145 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data); 8146 8147 // Unmap all processors in parallel first. Unmapping the whole block won't 8148 // cause a page table split, so this should only fail if we have a system- 8149 // fatal error. 8150 if (!uvm_processor_mask_empty(&block->mapped)) { 8151 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 8152 8153 // HMM CPU mappings are controlled by Linux so no need to unmap. 8154 // Remote GPU mappings will be removed below. 8155 if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) { 8156 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]); 8157 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]); 8158 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 8159 } 8160 8161 // We could only be killed with mapped GPU state by VA range free or VA 8162 // space teardown, so it's safe to use the va_space's block_context 8163 // because both of those have the VA space lock held in write mode. 8164 status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL); 8165 UVM_ASSERT(status == uvm_global_get_status()); 8166 } 8167 8168 UVM_ASSERT(uvm_processor_mask_empty(&block->mapped)); 8169 8170 // Free the GPU page tables and chunks 8171 for_each_gpu_id(id) 8172 block_destroy_gpu_state(block, id); 8173 8174 // Wait for the GPU PTE unmaps before freeing CPU memory 8175 uvm_tracker_wait_deinit(&block->tracker); 8176 8177 // No processor should have the CPU mapped at this point 8178 UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU)); 8179 8180 // Free CPU pages 8181 for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) { 8182 // be conservative. 8183 // Tell the OS we wrote to the page because we sometimes clear the dirty 8184 // bit after writing to it. HMM dirty flags are managed by the kernel. 
8185 if (!uvm_va_block_is_hmm(block)) 8186 uvm_cpu_chunk_mark_dirty(chunk, 0); 8187 uvm_cpu_chunk_remove_from_block(block, page_index); 8188 uvm_cpu_chunk_free(chunk); 8189 } 8190 8191 uvm_kvfree((void *)block->cpu.chunks); 8192 block->cpu.chunks = 0; 8193 8194 // Clearing the resident bit isn't strictly necessary since this block 8195 // is getting destroyed, but it keeps state consistent for assertions. 8196 uvm_page_mask_zero(&block->cpu.resident); 8197 block_clear_resident_processor(block, UVM_ID_CPU); 8198 8199 if (uvm_va_block_is_hmm(block)) 8200 uvm_va_policy_clear(block, block->start, block->end); 8201 8202 block->va_range = NULL; 8203 #if UVM_IS_CONFIG_HMM() 8204 block->hmm.va_space = NULL; 8205 #endif 8206 } 8207 8208 // Called when the block's ref count drops to 0 8209 void uvm_va_block_destroy(nv_kref_t *nv_kref) 8210 { 8211 uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref); 8212 8213 // Nobody else should have a reference when freeing 8214 uvm_assert_mutex_unlocked(&block->lock); 8215 8216 uvm_mutex_lock(&block->lock); 8217 block_kill(block); 8218 uvm_mutex_unlock(&block->lock); 8219 8220 if (uvm_enable_builtin_tests) { 8221 uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block); 8222 8223 kmem_cache_free(g_uvm_va_block_cache, block_wrapper); 8224 } 8225 else { 8226 kmem_cache_free(g_uvm_va_block_cache, block); 8227 } 8228 } 8229 8230 void uvm_va_block_kill(uvm_va_block_t *va_block) 8231 { 8232 uvm_mutex_lock(&va_block->lock); 8233 block_kill(va_block); 8234 uvm_mutex_unlock(&va_block->lock); 8235 8236 // May call block_kill again 8237 uvm_va_block_release(va_block); 8238 } 8239 8240 static void block_gpu_release_region(uvm_va_block_t *va_block, 8241 uvm_gpu_id_t gpu_id, 8242 uvm_va_block_gpu_state_t *gpu_state, 8243 uvm_page_mask_t *page_mask, 8244 uvm_va_block_region_t region) 8245 { 8246 uvm_page_index_t page_index; 8247 8248 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 8249 uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index]; 8250 8251 if (!gpu_chunk) 8252 continue; 8253 8254 // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks 8255 8256 uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker); 8257 8258 // The GPU chunk will be freed when the device private reference drops. 8259 if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) && 8260 uvm_page_mask_empty(&gpu_state->resident)) 8261 block_clear_resident_processor(va_block, gpu_id); 8262 8263 gpu_state->chunks[page_index] = NULL; 8264 } 8265 } 8266 8267 void uvm_va_block_munmap_region(uvm_va_block_t *va_block, 8268 uvm_va_block_region_t region) 8269 { 8270 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 8271 uvm_perf_event_data_t event_data; 8272 uvm_gpu_id_t gpu_id; 8273 8274 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 8275 uvm_assert_mutex_locked(&va_block->lock); 8276 8277 // Reset thrashing state for the region. 8278 event_data.block_munmap.block = va_block; 8279 event_data.block_munmap.region = region; 8280 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data); 8281 8282 // Set a flag so that GPU fault events are flushed since they might refer 8283 // to the region being unmapped. 8284 // Note that holding the va_block lock prevents GPU VA spaces from 8285 // being removed so the registered_gpu_va_spaces mask is stable. 
8286 for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) { 8287 uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id); 8288 } 8289 8290 // Release any remaining vidmem chunks in the given region. 8291 for_each_gpu_id(gpu_id) { 8292 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 8293 8294 if (!gpu_state) 8295 continue; 8296 8297 uvm_page_mask_region_clear(&gpu_state->evicted, region); 8298 if (uvm_page_mask_empty(&gpu_state->evicted)) 8299 uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id); 8300 8301 if (gpu_state->chunks) { 8302 block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region); 8303 8304 // TODO: bug 3660922: Need to update the read duplicated pages mask 8305 // when read duplication is supported for HMM. 8306 } 8307 else { 8308 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id)); 8309 } 8310 } 8311 8312 uvm_va_policy_clear(va_block, 8313 uvm_va_block_region_start(va_block, region), 8314 uvm_va_block_region_end(va_block, region)); 8315 } 8316 8317 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 8318 { 8319 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 8320 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 8321 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 8322 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); 8323 NvU32 alloc_sizes; 8324 DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8325 uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8326 size_t big_page_index; 8327 uvm_push_t push; 8328 NV_STATUS status; 8329 8330 // We only have to split to big PTEs if we're currently a 2M PTE 8331 if (existing_gpu_state->pte_is_2m) { 8332 // We can skip the split if the 2M PTE is invalid and we have no lower 8333 // PTEs. 8334 if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE && 8335 !existing_gpu_state->page_table_range_big.table && 8336 !existing_gpu_state->page_table_range_4k.table) 8337 return NV_OK; 8338 8339 alloc_sizes = big_page_size; 8340 bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8341 8342 if (!IS_ALIGNED(new->start, big_page_size)) { 8343 alloc_sizes |= UVM_PAGE_SIZE_4K; 8344 8345 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); 8346 __clear_bit(big_page_index, new_big_ptes); 8347 } 8348 8349 status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL); 8350 if (status != NV_OK) 8351 return status; 8352 8353 status = uvm_push_begin_acquire(gpu->channel_manager, 8354 UVM_CHANNEL_TYPE_MEMOPS, 8355 &existing->tracker, 8356 &push, 8357 "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", 8358 existing->start, existing->end + 1, 8359 new->start, new->end + 1); 8360 if (status != NV_OK) 8361 return status; 8362 8363 block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push); 8364 } 8365 else { 8366 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); 8367 8368 // If the split point is on a big page boundary, or if the split point 8369 // is not currently covered by a big PTE, we don't have to split 8370 // anything. 
8371 if (IS_ALIGNED(new->start, big_page_size) || 8372 big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK || 8373 !test_bit(big_page_index, existing_gpu_state->big_ptes)) 8374 return NV_OK; 8375 8376 status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL); 8377 if (status != NV_OK) 8378 return status; 8379 8380 bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 8381 __set_bit(big_page_index, new_big_ptes); 8382 8383 status = uvm_push_begin_acquire(gpu->channel_manager, 8384 UVM_CHANNEL_TYPE_MEMOPS, 8385 &existing->tracker, 8386 &push, 8387 "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", 8388 existing->start, existing->end + 1, 8389 new->start, new->end + 1); 8390 if (status != NV_OK) 8391 return status; 8392 8393 block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push); 8394 } 8395 8396 uvm_push_end(&push); 8397 8398 // Adding this push to existing block tracker will cause all GPU PTE splits 8399 // to serialize on each other, but it's simpler than maintaining a separate 8400 // tracker and this path isn't performance-critical. 8401 return uvm_tracker_add_push_safe(&existing->tracker, &push); 8402 } 8403 8404 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new) 8405 { 8406 uvm_gpu_t *gpu; 8407 uvm_gpu_id_t id; 8408 NV_STATUS status; 8409 8410 for_each_gpu_id(id) { 8411 if (!uvm_va_block_gpu_state_get(existing, id)) 8412 continue; 8413 8414 gpu = block_get_gpu(existing, id); 8415 8416 if (block_gpu_has_page_tables(existing, gpu)) { 8417 status = block_split_presplit_ptes_gpu(existing, new, gpu); 8418 if (status != NV_OK) 8419 return status; 8420 } 8421 } 8422 8423 return NV_OK; 8424 } 8425 8426 typedef struct 8427 { 8428 // Number of chunks contained by this VA block 8429 size_t num_chunks; 8430 8431 // Index of the "interesting" chunk, either adjacent to or spanning the 8432 // split point depending on which block this is. 8433 size_t chunk_index; 8434 8435 // Size of the chunk referenced by chunk_index 8436 uvm_chunk_size_t chunk_size; 8437 } block_gpu_chunk_split_state_t; 8438 8439 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block, 8440 block_gpu_chunk_split_state_t *state, 8441 NvU64 start, 8442 NvU64 end, 8443 uvm_page_index_t page_index, 8444 uvm_gpu_t *gpu) 8445 { 8446 NvU64 size = end - start + 1; 8447 state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu); 8448 state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size); 8449 } 8450 8451 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) 8452 { 8453 uvm_gpu_t *accessing_gpu; 8454 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 8455 8456 uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk); 8457 8458 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 8459 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 8460 8461 uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, 8462 peer_addr, 8463 uvm_gpu_chunk_get_size(chunk)); 8464 } 8465 } 8466 8467 // Perform any chunk splitting and array growing required for this block split, 8468 // but don't actually move chunk pointers anywhere. 
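//
// On failure the chunk covering the split point is merged back, leaving
// existing's chunk state as it was on entry.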
8469 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 8470 { 8471 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 8472 uvm_gpu_t *accessing_gpu; 8473 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 8474 uvm_gpu_chunk_t **temp_chunks; 8475 uvm_gpu_chunk_t *original_chunk, *curr_chunk; 8476 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8477 uvm_chunk_sizes_mask_t split_sizes; 8478 uvm_chunk_size_t subchunk_size; 8479 NV_STATUS status; 8480 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; 8481 8482 block_gpu_chunk_get_split_state(existing, 8483 &existing_before_state, 8484 existing->start, 8485 existing->end, 8486 split_page_index, 8487 gpu); 8488 block_gpu_chunk_get_split_state(existing, 8489 &existing_after_state, 8490 existing->start, 8491 new->start - 1, 8492 split_page_index - 1, 8493 gpu); 8494 block_gpu_chunk_get_split_state(new, 8495 &new_state, 8496 new->start, 8497 new->end, 8498 0, 8499 gpu); 8500 8501 // Even though we're splitting existing, we could wind up requiring a larger 8502 // chunks array if we split a large chunk into many smaller ones. 8503 if (existing_after_state.num_chunks > existing_before_state.num_chunks) { 8504 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, 8505 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); 8506 if (!temp_chunks) 8507 return NV_ERR_NO_MEMORY; 8508 existing_gpu_state->chunks = temp_chunks; 8509 } 8510 8511 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; 8512 8513 // If the chunk covering the split point is not populated, we're done. We've 8514 // already grown the array to cover any new chunks which may be populated 8515 // later. 8516 if (!original_chunk) 8517 return NV_OK; 8518 8519 // Figure out the splits we need to perform. Remove all sizes >= the current 8520 // size, and all sizes < the target size. Note that the resulting mask will 8521 // be 0 if the sizes match (we're already splitting at a chunk boundary). 8522 UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size); 8523 UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size); 8524 split_sizes = gpu->parent->mmu_user_chunk_sizes; 8525 split_sizes &= existing_before_state.chunk_size - 1; 8526 split_sizes &= ~(new_state.chunk_size - 1); 8527 8528 // Keep splitting the chunk covering the split point until we hit the target 8529 // size. 8530 curr_chunk = original_chunk; 8531 for_each_chunk_size_rev(subchunk_size, split_sizes) { 8532 size_t last_index, num_subchunks; 8533 8534 status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL); 8535 if (status != NV_OK) 8536 goto error; 8537 8538 // Split physical GPU mappings for indirect peers 8539 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 8540 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu); 8541 8542 status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, 8543 peer_addr, 8544 subchunk_size); 8545 if (status != NV_OK) 8546 goto error; 8547 } 8548 8549 if (subchunk_size == new_state.chunk_size) 8550 break; 8551 8552 // Compute the last subchunk index prior to the split point. 
Divide the 8553 // entire address space into units of subchunk_size, then mod by the 8554 // number of subchunks within the parent. 8555 last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size); 8556 num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size); 8557 UVM_ASSERT(num_subchunks > 1); 8558 last_index &= num_subchunks - 1; 8559 8560 uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk); 8561 UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size); 8562 } 8563 8564 // Note that existing's chunks array still has a pointer to original_chunk, 8565 // not to any newly-split subchunks. If a subsequent split failure occurs on 8566 // a later GPU we'll have to merge it back. Once we're past the preallocate 8567 // stage we'll remove it from the chunks array and move the new split chunks 8568 // in. 8569 8570 return NV_OK; 8571 8572 error: 8573 // On error we need to leave the chunk in its initial state 8574 block_merge_chunk(existing, gpu, original_chunk); 8575 8576 return status; 8577 } 8578 8579 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block) 8580 { 8581 uvm_cpu_chunk_storage_mixed_t *mixed; 8582 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0); 8583 NV_STATUS status; 8584 8585 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); 8586 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK); 8587 8588 mixed = uvm_kvmalloc_zero(sizeof(*mixed)); 8589 if (!mixed) 8590 return NV_ERR_NO_MEMORY; 8591 8592 status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots); 8593 if (status != NV_OK) { 8594 uvm_kvfree(mixed); 8595 return status; 8596 } 8597 8598 bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK); 8599 block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 8600 return status; 8601 } 8602 8603 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index) 8604 { 8605 uvm_cpu_chunk_storage_mixed_t *mixed; 8606 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8607 uvm_cpu_chunk_t **small_chunks; 8608 size_t slot_index; 8609 NV_STATUS status; 8610 8611 UVM_ASSERT(chunk); 8612 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 8613 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8614 8615 mixed = uvm_cpu_storage_get_ptr(block); 8616 slot_index = compute_slot_index(block, page_index); 8617 small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 8618 if (!small_chunks) 8619 return NV_ERR_NO_MEMORY; 8620 8621 status = uvm_cpu_chunk_split(chunk, small_chunks); 8622 if (status != NV_OK) { 8623 uvm_kvfree(small_chunks); 8624 return status; 8625 } 8626 8627 mixed->slots[slot_index] = small_chunks; 8628 clear_bit(slot_index, mixed->big_chunks); 8629 return status; 8630 } 8631 8632 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index) 8633 { 8634 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8635 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); 8636 uvm_chunk_size_t new_size; 8637 uvm_gpu_t *gpu; 8638 NvU64 gpu_mapping_addr; 8639 uvm_processor_mask_t gpu_split_mask; 8640 uvm_gpu_id_t id; 8641 NV_STATUS status; 8642 8643 if (chunk_size == UVM_CHUNK_SIZE_2M) 8644 new_size = UVM_CHUNK_SIZE_64K; 8645 else 8646 new_size = UVM_CHUNK_SIZE_4K; 8647 8648 UVM_ASSERT(IS_ALIGNED(chunk_size, new_size)); 
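    // Before splitting the CPU chunk itself, split any GPU-side reverse sysmem
    // mappings covering it. gpu_split_mask tracks which GPUs were split so
    // they can be merged back if the chunk split below fails.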
8649 8650 uvm_processor_mask_zero(&gpu_split_mask); 8651 for_each_gpu_id(id) { 8652 if (!uvm_va_block_gpu_state_get(block, id)) 8653 continue; 8654 8655 gpu = block_get_gpu(block, id); 8656 8657 // If the parent chunk has not been mapped, there is nothing to split. 8658 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8659 if (gpu_mapping_addr == 0) 8660 continue; 8661 8662 status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8663 gpu_mapping_addr, 8664 new_size); 8665 if (status != NV_OK) 8666 goto merge; 8667 8668 uvm_processor_mask_set(&gpu_split_mask, id); 8669 } 8670 8671 if (new_size == UVM_CHUNK_SIZE_64K) 8672 status = block_split_cpu_chunk_to_64k(block); 8673 else 8674 status = block_split_cpu_chunk_to_4k(block, page_index); 8675 8676 if (status != NV_OK) { 8677 merge: 8678 for_each_gpu_id_in_mask(id, &gpu_split_mask) { 8679 gpu = block_get_gpu(block, id); 8680 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8681 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8682 gpu_mapping_addr, 8683 chunk_size); 8684 } 8685 } 8686 8687 return status; 8688 } 8689 8690 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new) 8691 { 8692 uvm_cpu_chunk_storage_mixed_t *existing_mixed; 8693 uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL; 8694 size_t slot_offset; 8695 size_t existing_slot; 8696 NV_STATUS status = NV_OK; 8697 8698 UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED); 8699 existing_mixed = uvm_cpu_storage_get_ptr(existing); 8700 8701 // Pre-allocate chunk storage for the new block. By definition, the new block 8702 // will contain either 64K and/or 4K chunks. 8703 // 8704 // We do this here so there are no failures in block_split_cpu(). 8705 new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed)); 8706 if (!new_mixed) 8707 return NV_ERR_NO_MEMORY; 8708 8709 slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start)); 8710 existing_slot = slot_offset; 8711 for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) { 8712 size_t new_slot = existing_slot - slot_offset; 8713 8714 if (existing_mixed->slots[existing_slot]) { 8715 uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT); 8716 8717 if (!small_chunks) { 8718 status = NV_ERR_NO_MEMORY; 8719 goto done; 8720 } 8721 8722 new_mixed->slots[new_slot] = small_chunks; 8723 } 8724 } 8725 8726 new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED; 8727 UVM_ASSERT(status == NV_OK); 8728 8729 done: 8730 if (status != NV_OK) { 8731 for (; existing_slot > slot_offset; existing_slot--) 8732 uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]); 8733 8734 uvm_kvfree(new_mixed); 8735 } 8736 8737 return status; 8738 } 8739 8740 static void block_free_cpu_chunk_storage(uvm_va_block_t *block) 8741 { 8742 if (block->cpu.chunks) { 8743 uvm_cpu_chunk_storage_mixed_t *mixed; 8744 size_t slot_index; 8745 8746 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8747 mixed = uvm_cpu_storage_get_ptr(block); 8748 for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++) 8749 uvm_kvfree(mixed->slots[slot_index]); 8750 8751 uvm_kvfree(mixed); 8752 block->cpu.chunks = 0; 8753 } 8754 } 8755 8756 // Perform any CPU chunk splitting that may be required for this block split. 
8757 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block. 8758 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) 8759 { 8760 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); 8761 uvm_cpu_chunk_t *splitting_chunk; 8762 uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes(); 8763 uvm_chunk_size_t subchunk_size; 8764 NV_STATUS status = NV_OK; 8765 8766 UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE)); 8767 splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8768 8769 // If the page covering the split point has not been populated, there is no 8770 // need to split. 8771 if (!splitting_chunk) 8772 return NV_OK; 8773 8774 // If the split point is aligned on the chunk size, there is no need to 8775 // split. 8776 if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk))) 8777 return NV_OK; 8778 8779 // Remove all sizes above the chunk's current size. 8780 split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1; 8781 // Remove all sizes below the alignment of the new block's start. 8782 split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0); 8783 8784 for_each_chunk_size_rev(subchunk_size, split_sizes) { 8785 status = block_split_cpu_chunk_one(existing, page_index); 8786 if (status != NV_OK) 8787 return status; 8788 } 8789 8790 return block_prealloc_cpu_chunk_storage(existing, new); 8791 } 8792 8793 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index) 8794 { 8795 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block); 8796 size_t slot_index = compute_slot_index(block, page_index); 8797 uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index]; 8798 uvm_cpu_chunk_t *merged_chunk; 8799 8800 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8801 UVM_ASSERT(small_chunks); 8802 UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks)); 8803 8804 merged_chunk = uvm_cpu_chunk_merge(small_chunks); 8805 mixed->slots[slot_index] = merged_chunk; 8806 set_bit(slot_index, mixed->big_chunks); 8807 uvm_kvfree(small_chunks); 8808 } 8809 8810 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index) 8811 { 8812 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block); 8813 uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots; 8814 uvm_cpu_chunk_t *merged_chunk; 8815 8816 UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED); 8817 UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK)); 8818 8819 merged_chunk = uvm_cpu_chunk_merge(big_chunks); 8820 block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK; 8821 uvm_kvfree(mixed); 8822 } 8823 8824 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index) 8825 { 8826 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8827 uvm_gpu_id_t id; 8828 8829 if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) { 8830 block_merge_cpu_chunks_to_64k(block, page_index); 8831 } 8832 else { 8833 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K); 8834 block_merge_cpu_chunks_to_2m(block, page_index); 8835 } 8836 8837 chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); 8838 8839 for_each_gpu_id(id) { 8840 NvU64 gpu_mapping_addr; 8841 uvm_gpu_t *gpu; 8842 8843 if (!uvm_va_block_gpu_state_get(block, id)) 8844 
continue; 8845 8846 gpu = block_get_gpu(block, id); 8847 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent); 8848 if (gpu_mapping_addr == 0) 8849 continue; 8850 8851 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, 8852 gpu_mapping_addr, 8853 uvm_cpu_chunk_get_size(chunk)); 8854 } 8855 } 8856 8857 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) 8858 { 8859 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); 8860 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8861 uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes(); 8862 uvm_chunk_size_t largest_size; 8863 uvm_chunk_size_t chunk_size; 8864 uvm_chunk_size_t merge_size; 8865 size_t block_size = uvm_va_block_size(existing); 8866 8867 if (!chunk || uvm_cpu_chunk_is_physical(chunk)) 8868 return; 8869 8870 chunk_size = uvm_cpu_chunk_get_size(chunk); 8871 8872 // Remove all CPU chunk sizes above the size of the existing VA block. 8873 // Since block sizes are not always powers of 2, use the largest power of 2 8874 // less than or equal to the block size since we can't merge to a size 8875 // larger than the block's size. 8876 largest_size = rounddown_pow_of_two(block_size); 8877 merge_sizes &= (largest_size | (largest_size - 1)); 8878 8879 // Remove all CPU chunk sizes smaller than the size of the chunk being merged up. 8880 merge_sizes &= ~(chunk_size | (chunk_size - 1)); 8881 8882 for_each_chunk_size(merge_size, merge_sizes) { 8883 uvm_va_block_region_t chunk_region; 8884 8885 // The block has to fully contain the VA range after the merge. 8886 if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) || 8887 !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1)) 8888 break; 8889 8890 chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index); 8891 8892 // If not all pages in the region covered by the chunk are allocated, 8893 // we can't merge. 8894 if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region)) 8895 break; 8896 8897 block_merge_cpu_chunks_one(existing, chunk_region.first); 8898 chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); 8899 if (uvm_cpu_chunk_is_physical(chunk)) 8900 break; 8901 } 8902 8903 block_free_cpu_chunk_storage(new); 8904 } 8905 8906 // Pre-allocate everything which doesn't require retry on both existing and new 8907 // which will be needed to handle a split. If this fails, existing must remain 8908 // functionally unmodified. 
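//
// Illustrative sketch of the transactional pattern this enables (a
// hypothetical wrapper, not driver code; locking and the remaining split
// steps are elided, and it only calls functions defined in this file with
// these signatures):
//
//     static NV_STATUS example_split_commit(uvm_va_block_t *existing, uvm_va_block_t *new)
//     {
//         // Single point of failure: every allocation happens up front.
//         NV_STATUS status = block_split_preallocate_no_retry(existing, new);
//         if (status != NV_OK)
//             return status; // existing is still functionally unmodified
//
//         // Past this point nothing is allowed to fail, so the irreversible
//         // state moves are safe.
//         block_split_cpu(existing, new);
//         return NV_OK;
//     }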
8909 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new) 8910 { 8911 NV_STATUS status; 8912 uvm_gpu_t *gpu; 8913 uvm_gpu_id_t id; 8914 uvm_page_index_t split_page_index; 8915 uvm_va_block_test_t *block_test; 8916 8917 status = block_presplit_cpu_chunks(existing, new); 8918 if (status != NV_OK) 8919 goto error; 8920 8921 for_each_gpu_id(id) { 8922 if (!uvm_va_block_gpu_state_get(existing, id)) 8923 continue; 8924 8925 gpu = block_get_gpu(existing, id); 8926 8927 status = block_presplit_gpu_chunks(existing, new, gpu); 8928 if (status != NV_OK) 8929 goto error; 8930 8931 if (!block_gpu_state_get_alloc(new, gpu)) { 8932 status = NV_ERR_NO_MEMORY; 8933 goto error; 8934 } 8935 } 8936 8937 block_test = uvm_va_block_get_test(existing); 8938 if (block_test && block_test->inject_split_error) { 8939 block_test->inject_split_error = false; 8940 if (!uvm_va_block_is_hmm(existing)) { 8941 UVM_ASSERT(existing->va_range->inject_split_error); 8942 existing->va_range->inject_split_error = false; 8943 } 8944 status = NV_ERR_NO_MEMORY; 8945 goto error; 8946 } 8947 8948 if (uvm_va_block_is_hmm(existing)) { 8949 uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start); 8950 8951 if (node && node->node.start != new->start) { 8952 status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL); 8953 if (status != NV_OK) 8954 goto error; 8955 } 8956 } 8957 8958 return NV_OK; 8959 8960 error: 8961 // Merge back the chunks we split 8962 split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 8963 8964 for_each_gpu_id(id) { 8965 uvm_gpu_chunk_t *chunk; 8966 size_t chunk_index; 8967 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id); 8968 8969 if (!existing_gpu_state) 8970 continue; 8971 8972 // If the chunk spanning the split point was split, merge it back 8973 gpu = block_get_gpu(existing, id); 8974 chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL); 8975 chunk = existing_gpu_state->chunks[chunk_index]; 8976 if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) 8977 continue; 8978 8979 block_merge_chunk(existing, gpu, chunk); 8980 8981 // We could attempt to shrink the chunks array back down, but it doesn't 8982 // hurt much to have it larger than necessary, and we'd have to handle 8983 // the shrink call failing anyway on this error path. 8984 8985 } 8986 8987 block_merge_cpu_chunks(existing, new); 8988 8989 return status; 8990 } 8991 8992 // Re-calculate the block's top-level processor masks: 8993 // - block->mapped 8994 // - block->resident 8995 // 8996 // This is called on block split. 
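//
// The rule applied for each processor P is (sketch, not code; pte_bits_read(P)
// and resident_mask(P) are shorthand for the per-processor masks consulted
// below):
//
//     mapped(P)   := !uvm_page_mask_region_empty(pte_bits_read(P), block_region)
//     resident(P) := !uvm_page_mask_region_empty(resident_mask(P), block_region)
//
// evicted_gpus is recomputed the same way from each GPU's evicted mask.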
8997 static void block_set_processor_masks(uvm_va_block_t *block) 8998 { 8999 size_t num_pages = uvm_va_block_num_cpu_pages(block); 9000 uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages); 9001 uvm_gpu_id_t id; 9002 9003 if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) { 9004 UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region)); 9005 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); 9006 } 9007 else { 9008 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); 9009 } 9010 9011 if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) { 9012 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); 9013 9014 if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0) 9015 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)); 9016 9017 block_clear_resident_processor(block, UVM_ID_CPU); 9018 } 9019 else { 9020 block_set_resident_processor(block, UVM_ID_CPU); 9021 } 9022 9023 for_each_gpu_id(id) { 9024 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); 9025 if (!gpu_state) 9026 continue; 9027 9028 if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) { 9029 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region)); 9030 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region)); 9031 uvm_processor_mask_clear(&block->mapped, id); 9032 } 9033 else { 9034 uvm_processor_mask_set(&block->mapped, id); 9035 } 9036 9037 if (uvm_page_mask_region_empty(&gpu_state->resident, block_region)) 9038 block_clear_resident_processor(block, id); 9039 else 9040 block_set_resident_processor(block, id); 9041 9042 if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region)) 9043 uvm_processor_mask_clear(&block->evicted_gpus, id); 9044 else 9045 uvm_processor_mask_set(&block->evicted_gpus, id); 9046 } 9047 } 9048 9049 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts 9050 // corresponding to a block split. 9051 static void block_split_page_mask(uvm_page_mask_t *existing_mask, 9052 size_t existing_pages, 9053 uvm_page_mask_t *new_mask, 9054 size_t new_pages) 9055 { 9056 UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n", 9057 existing_pages, new_pages); 9058 9059 // The new block is always in the upper region of existing, so shift the bit 9060 // vectors down. 9061 // 9062 // Note that bitmap_shift_right requires both dst and src to be the same 9063 // size. That's ok since we don't scale them by block size. 9064 uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages); 9065 uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages)); 9066 } 9067 9068 // Split the CPU state within the existing block. existing's start is correct 9069 // but its end has not yet been adjusted. 
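//
// Worked example of the mask handling below, assuming a 4K PAGE_SIZE: a fully
// sized 2M block has 512 pages, so a split at page 384 gives
// existing_pages = 384 and new_pages = 128. block_split_page_mask() shifts
// each per-page bit vector right by 384 so that bit 0 of the new block's mask
// corresponds to old page 384, then clears bits 384..511 in the existing
// block's mask.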
9070 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new) 9071 { 9072 size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new); 9073 uvm_pte_bits_cpu_t pte_bit; 9074 uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing); 9075 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 9076 uvm_page_index_t page_index; 9077 uvm_page_index_t next_page_index; 9078 uvm_cpu_chunk_t *chunk; 9079 uvm_va_range_t *existing_va_range = existing->va_range; 9080 9081 if (existing_va_range) { 9082 UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9083 UVM_ASSERT(existing->va_range->type == new->va_range->type); 9084 } 9085 9086 UVM_ASSERT(existing->start < new->start); 9087 UVM_ASSERT(existing->end == new->end); 9088 9089 UVM_ASSERT(PAGE_ALIGNED(new->start)); 9090 UVM_ASSERT(PAGE_ALIGNED(existing->start)); 9091 9092 existing_pages = (new->start - existing->start) / PAGE_SIZE; 9093 9094 // We don't have to unmap the CPU since its virtual -> physical mappings 9095 // don't change. 9096 9097 page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1); 9098 9099 for_each_cpu_chunk_in_block_region_safe(chunk, 9100 page_index, 9101 next_page_index, 9102 existing, 9103 uvm_va_block_region(split_page_index, block_region.outer)) { 9104 uvm_page_index_t new_chunk_page_index; 9105 NV_STATUS status; 9106 9107 uvm_cpu_chunk_remove_from_block(existing, page_index); 9108 9109 // The chunk has to be adjusted for the new block before inserting it. 9110 new_chunk_page_index = page_index - split_page_index; 9111 9112 // This should never fail because all necessary storage was allocated 9113 // in block_presplit_cpu_chunks(). 9114 status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index); 9115 UVM_ASSERT(status == NV_OK); 9116 } 9117 9118 new->cpu.ever_mapped = existing->cpu.ever_mapped; 9119 9120 block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages); 9121 9122 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) 9123 block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages); 9124 } 9125 9126 // Fill out the blocks' chunks arrays with the chunks split by 9127 // block_presplit_gpu_chunks. 
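//
// This only copies and re-parents chunk pointers: the chunk spanning the
// split point was already physically split by block_presplit_gpu_chunks(),
// and both blocks' chunks arrays were already allocated large enough for the
// post-split layout, so nothing here can fail (the optional shrink of
// existing's array at the end tolerates realloc failure). The reverse-map
// va_block pointers of the chunks handed to the new block are updated under
// the PMM list lock.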
9128 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) 9129 { 9130 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); 9131 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id); 9132 uvm_gpu_chunk_t **temp_chunks; 9133 uvm_gpu_chunk_t *original_chunk; 9134 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; 9135 size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new; 9136 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); 9137 size_t i; 9138 9139 block_gpu_chunk_get_split_state(existing, 9140 &existing_before_state, 9141 existing->start, 9142 existing->end, 9143 split_page_index, 9144 gpu); 9145 block_gpu_chunk_get_split_state(existing, 9146 &existing_after_state, 9147 existing->start, 9148 new->start - 1, 9149 split_page_index - 1, 9150 gpu); 9151 block_gpu_chunk_get_split_state(new, 9152 &new_state, 9153 new->start, 9154 new->end, 9155 0, 9156 gpu); 9157 9158 // General case (B is original_chunk): 9159 // split 9160 // v 9161 // existing (before) [------ A -----][------ B -----][------ C -----] 9162 // existing (after) [------ A -----][- B0 -] 9163 // new [- B1 -][------ C -----] 9164 // 9165 // Note that the logic below also handles the case of the split happening at 9166 // a chunk boundary. That case behaves as though there is no B0 chunk. 9167 9168 // Number of chunks to the left and right of original_chunk (A and C above). 9169 // Either or both of these may be 0. 9170 num_pre_chunks = existing_before_state.chunk_index; 9171 num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1; 9172 9173 // Number of subchunks under existing's portion of original_chunk (B0 above) 9174 num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks; 9175 9176 // Number of subchunks under new's portion of original_chunk (B1 above) 9177 num_split_chunks_new = new_state.num_chunks - num_post_chunks; 9178 9179 UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0); 9180 UVM_ASSERT(num_split_chunks_new > 0); 9181 9182 // Copy post chunks from the end of existing into new (C above) 9183 memcpy(&new_gpu_state->chunks[num_split_chunks_new], 9184 &existing_gpu_state->chunks[existing_before_state.chunk_index + 1], 9185 num_post_chunks * sizeof(new_gpu_state->chunks[0])); 9186 9187 // Save off the original split chunk since we may overwrite the array 9188 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; 9189 9190 // Fill out the new pointers 9191 if (original_chunk) { 9192 // Note that if the split happened at a chunk boundary, original_chunk 9193 // will not be split. In that case, num_split_chunks_existing will be 0 9194 // and num_split_chunks_new will be 1, so the left copy will be skipped 9195 // and the right copy will pick up the chunk. 9196 9197 // Copy left newly-split chunks into existing (B0 above). The array was 9198 // re-sized in block_presplit_gpu_chunks as necessary. 9199 size_t num_subchunks; 9200 9201 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, 9202 original_chunk, 9203 0, // start_index 9204 num_split_chunks_existing, 9205 &existing_gpu_state->chunks[existing_before_state.chunk_index]); 9206 UVM_ASSERT(num_subchunks == num_split_chunks_existing); 9207 9208 // Copy right newly-split chunks into new (B1 above), overwriting the 9209 // pointer to the original chunk. 
9210 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, 9211 original_chunk, 9212 num_split_chunks_existing, // start_index 9213 num_split_chunks_new, 9214 &new_gpu_state->chunks[0]); 9215 UVM_ASSERT(num_subchunks == num_split_chunks_new); 9216 } 9217 else { 9218 // If the chunk wasn't already populated we don't need to copy pointers 9219 // anywhere, but we need to clear out stale pointers from existing's 9220 // array covering the new elements. new's chunks array was already zero- 9221 // initialized. 9222 memset(&existing_gpu_state->chunks[existing_before_state.chunk_index], 9223 0, 9224 num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0])); 9225 } 9226 9227 // Since we update the reverse map information, protect it against a 9228 // concurrent lookup 9229 uvm_spin_lock(&gpu->pmm.list_lock); 9230 9231 // Update the reverse map of all the chunks that are now under the new block 9232 for (i = 0; i < new_state.num_chunks; ++i) { 9233 if (new_gpu_state->chunks[i]) { 9234 UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing); 9235 new_gpu_state->chunks[i]->va_block = new; 9236 9237 // Adjust the page_index within the VA block for the new subchunks in 9238 // the new VA block 9239 UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index); 9240 new_gpu_state->chunks[i]->va_block_page_index -= split_page_index; 9241 } 9242 } 9243 9244 uvm_spin_unlock(&gpu->pmm.list_lock); 9245 9246 // Attempt to shrink existing's chunk allocation. If the realloc fails, just 9247 // keep on using the old larger one. 9248 if (existing_after_state.num_chunks < existing_before_state.num_chunks) { 9249 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, 9250 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); 9251 if (temp_chunks) 9252 existing_gpu_state->chunks = temp_chunks; 9253 } 9254 } 9255 9256 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id) 9257 { 9258 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id); 9259 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id); 9260 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); 9261 uvm_gpu_va_space_t *gpu_va_space; 9262 uvm_gpu_t *gpu; 9263 uvm_gpu_t *accessing_gpu; 9264 size_t new_pages = uvm_va_block_num_cpu_pages(new); 9265 size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big; 9266 uvm_pte_bits_gpu_t pte_bit; 9267 size_t num_chunks, i; 9268 uvm_cpu_chunk_t *cpu_chunk; 9269 uvm_page_index_t page_index; 9270 9271 if (!existing_gpu_state) 9272 return; 9273 9274 gpu = uvm_va_space_get_gpu(va_space, gpu_id); 9275 UVM_ASSERT(new_gpu_state); 9276 9277 new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes; 9278 9279 UVM_ASSERT(PAGE_ALIGNED(new->start)); 9280 UVM_ASSERT(PAGE_ALIGNED(existing->start)); 9281 existing_pages = (new->start - existing->start) / PAGE_SIZE; 9282 9283 for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) { 9284 uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, 9285 uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent), 9286 new); 9287 } 9288 9289 block_copy_split_gpu_chunks(existing, new, gpu); 9290 9291 num_chunks = block_num_gpu_chunks(new, gpu); 9292 9293 // Reparent GPU mappings for indirect peers 9294 for (i = 0; i < num_chunks; ++i) { 9295 uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i]; 9296 if (!chunk) 9297 continue; 9298 9299 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, 
&va_space->indirect_peers[uvm_id_value(gpu->id)]) { 9300 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 9301 9302 uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, 9303 peer_addr, 9304 new); 9305 } 9306 } 9307 9308 block_split_page_mask(&existing_gpu_state->resident, 9309 existing_pages, 9310 &new_gpu_state->resident, 9311 new_pages); 9312 9313 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { 9314 block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages, 9315 &new_gpu_state->pte_bits[pte_bit], new_pages); 9316 } 9317 9318 // Adjust page table ranges. 9319 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); 9320 if (gpu_va_space) { 9321 if (existing_gpu_state->page_table_range_big.table) { 9322 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); 9323 9324 // existing's end has not been adjusted yet 9325 existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size); 9326 9327 // Take references on all big pages covered by new 9328 new_pages_big = uvm_va_block_num_big_pages(new, big_page_size); 9329 if (new_pages_big) { 9330 uvm_page_table_range_get_upper(&gpu_va_space->page_tables, 9331 &existing_gpu_state->page_table_range_big, 9332 &new_gpu_state->page_table_range_big, 9333 new_pages_big); 9334 9335 // If the split point is within a big page region, we might have 9336 // a gap since neither existing nor new can use it anymore. 9337 // Get the top N bits from existing's mask to handle that. 9338 bitmap_shift_right(new_gpu_state->big_ptes, 9339 existing_gpu_state->big_ptes, 9340 uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big, 9341 MAX_BIG_PAGES_PER_UVM_VA_BLOCK); 9342 9343 new_gpu_state->initialized_big = existing_gpu_state->initialized_big; 9344 } 9345 9346 // Drop existing's references on the big PTEs it no longer covers 9347 // now that new has references on them. Note that neither existing 9348 // nor new might have big PTEs after the split. In that case, this 9349 // shrink will free the entire old range. 9350 uvm_page_table_range_shrink(&gpu_va_space->page_tables, 9351 &existing_gpu_state->page_table_range_big, 9352 existing_pages_big); 9353 9354 if (existing_pages_big == 0) { 9355 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big)); 9356 existing_gpu_state->initialized_big = false; 9357 } 9358 9359 bitmap_clear(existing_gpu_state->big_ptes, 9360 existing_pages_big, 9361 MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big); 9362 } 9363 9364 if (existing_gpu_state->page_table_range_4k.table) { 9365 // Since existing and new share the same PDE we just need to bump 9366 // the ref-count on new's sub-range. 9367 uvm_page_table_range_get_upper(&gpu_va_space->page_tables, 9368 &existing_gpu_state->page_table_range_4k, 9369 &new_gpu_state->page_table_range_4k, 9370 uvm_va_block_size(new) / UVM_PAGE_SIZE_4K); 9371 9372 // Drop existing's references on the PTEs it no longer covers now 9373 // that new has references on them. 9374 existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K); 9375 uvm_page_table_range_shrink(&gpu_va_space->page_tables, 9376 &existing_gpu_state->page_table_range_4k, 9377 existing_pages_4k); 9378 } 9379 9380 // We have to set this explicitly to handle the case of splitting an 9381 // invalid, active 2M PTE with no lower page tables allocated. 
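        // In that case there are no big or 4k page table ranges to hand over
        // to the new block, so only the flag needs clearing here; the 2M
        // range itself is dropped just below via block_put_ptes_safe().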
9382 if (existing_gpu_state->pte_is_2m) { 9383 UVM_ASSERT(!existing_gpu_state->page_table_range_big.table); 9384 UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table); 9385 existing_gpu_state->pte_is_2m = false; 9386 } 9387 9388 // existing can't possibly cover 2MB after a split, so drop any 2M PTE 9389 // references it has. We've taken the necessary references on the lower 9390 // tables above. 9391 block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m); 9392 existing_gpu_state->activated_big = false; 9393 existing_gpu_state->activated_4k = false; 9394 } 9395 9396 block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages); 9397 } 9398 9399 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block, 9400 NvU64 new_end, 9401 uvm_va_block_t **new_va_block, 9402 uvm_va_range_t *new_va_range) 9403 { 9404 uvm_va_space_t *va_space; 9405 uvm_va_block_t *new_block = NULL; 9406 NV_STATUS status; 9407 9408 va_space = new_va_range->va_space; 9409 UVM_ASSERT(existing_va_block->va_range); 9410 UVM_ASSERT(existing_va_block->va_range->va_space == va_space); 9411 UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block)); 9412 9413 // External range types can't be split 9414 UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9415 UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 9416 uvm_assert_rwsem_locked_write(&va_space->lock); 9417 9418 UVM_ASSERT(new_end > existing_va_block->start); 9419 UVM_ASSERT(new_end < existing_va_block->end); 9420 UVM_ASSERT(PAGE_ALIGNED(new_end + 1)); 9421 9422 status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block); 9423 if (status != NV_OK) 9424 return status; 9425 9426 // We're protected from other splits and faults by the va_space lock being 9427 // held in write mode, but that doesn't stop the reverse mapping (eviction 9428 // path) from inspecting the existing block. Stop those threads by taking 9429 // the block lock. When a reverse mapping thread takes this lock after the 9430 // split has been performed, it will have to re-inspect state and may see 9431 // that it should use the newly-split block instead. 9432 uvm_mutex_lock(&existing_va_block->lock); 9433 9434 status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range); 9435 9436 uvm_mutex_unlock(&existing_va_block->lock); 9437 9438 if (status != NV_OK) 9439 uvm_va_block_release(new_block); 9440 else if (new_va_block) 9441 *new_va_block = new_block; 9442 9443 return status; 9444 } 9445 9446 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block, 9447 NvU64 new_end, 9448 uvm_va_block_t *new_block, 9449 uvm_va_range_t *new_va_range) 9450 { 9451 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block); 9452 uvm_gpu_id_t id; 9453 NV_STATUS status; 9454 uvm_perf_event_data_t event_data; 9455 9456 UVM_ASSERT(block_check_chunks(existing_va_block)); 9457 9458 // As soon as we update existing's reverse mappings to point to the newly- 9459 // split block, the eviction path could try to operate on the new block. 9460 // Lock that out too until new is ready. 9461 // 9462 // Note that we usually shouldn't nest block locks, but it's ok here because 9463 // we just created new_block so no other thread could possibly take it out 9464 // of order with existing's lock. 
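    // The _no_tracking variants are used for new_block's lock, here and at
    // the end of this function, to skip the driver's lock-order tracking,
    // which would otherwise flag the nested va_block locks.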
9465 uvm_mutex_lock_no_tracking(&new_block->lock); 9466 9467 // The split has to be transactional, meaning that if we fail, the existing 9468 // block must not be modified. Handle that by pre-allocating everything we 9469 // might need under both existing and new at the start so we only have a 9470 // single point of failure. 9471 9472 // Since pre-allocation might require allocating new PTEs, we have to handle 9473 // allocation retry which might drop existing's block lock. The 9474 // preallocation is split into two steps for that: the first part which 9475 // allocates and splits PTEs can handle having the block lock dropped then 9476 // re-taken. It won't modify existing_va_block other than adding new PTE 9477 // allocations and splitting existing PTEs, which is always safe. 9478 status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block, 9479 NULL, 9480 block_split_presplit_ptes(existing_va_block, new_block)); 9481 if (status != NV_OK) 9482 goto out; 9483 9484 // Pre-allocate, stage two. This modifies existing_va_block in ways which 9485 // violate many assumptions (such as changing chunk size), but it will put 9486 // things back into place on a failure without dropping the block lock. 9487 status = block_split_preallocate_no_retry(existing_va_block, new_block); 9488 if (status != NV_OK) 9489 goto out; 9490 9491 // We'll potentially be freeing page tables, so we need to wait for any 9492 // outstanding work before we start 9493 status = uvm_tracker_wait(&existing_va_block->tracker); 9494 if (status != NV_OK) 9495 goto out; 9496 9497 // Update existing's state only once we're past all failure points 9498 9499 event_data.block_shrink.block = existing_va_block; 9500 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data); 9501 9502 block_split_cpu(existing_va_block, new_block); 9503 9504 for_each_gpu_id(id) 9505 block_split_gpu(existing_va_block, new_block, id); 9506 9507 // Update the size of the existing block first so that 9508 // block_set_processor_masks can use block_{set,clear}_resident_processor 9509 // that relies on the size to be correct. 9510 existing_va_block->end = new_end; 9511 9512 block_split_page_mask(&existing_va_block->read_duplicated_pages, 9513 uvm_va_block_num_cpu_pages(existing_va_block), 9514 &new_block->read_duplicated_pages, 9515 uvm_va_block_num_cpu_pages(new_block)); 9516 9517 block_split_page_mask(&existing_va_block->maybe_mapped_pages, 9518 uvm_va_block_num_cpu_pages(existing_va_block), 9519 &new_block->maybe_mapped_pages, 9520 uvm_va_block_num_cpu_pages(new_block)); 9521 9522 block_set_processor_masks(existing_va_block); 9523 block_set_processor_masks(new_block); 9524 9525 if (uvm_va_block_is_hmm(existing_va_block)) { 9526 uvm_hmm_va_block_split_tree(existing_va_block, new_block); 9527 uvm_va_policy_node_split_move(existing_va_block, new_block); 9528 } 9529 9530 out: 9531 // Run checks on existing_va_block even on failure, since an error must 9532 // leave the block in a consistent state. 
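    // new_block, on the other hand, is only checked on success. On failure
    // its pre-allocated CPU chunk storage is released below, and the caller
    // is expected to release new_block itself (as uvm_va_block_split() does).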
9533 UVM_ASSERT(block_check_chunks(existing_va_block)); 9534 UVM_ASSERT(block_check_mappings(existing_va_block)); 9535 if (status == NV_OK) { 9536 UVM_ASSERT(block_check_chunks(new_block)); 9537 UVM_ASSERT(block_check_mappings(new_block)); 9538 } 9539 else { 9540 block_free_cpu_chunk_storage(new_block); 9541 } 9542 9543 uvm_mutex_unlock_no_tracking(&new_block->lock); 9544 9545 return status; 9546 } 9547 9548 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block, 9549 uvm_va_block_region_t region) 9550 { 9551 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9552 uvm_va_range_t *va_range = va_block->va_range; 9553 9554 if (!uvm_va_space_can_read_duplicate(va_space, NULL)) 9555 return false; 9556 9557 // TODO: Bug 3660922: need to implement HMM read duplication support. 9558 if (uvm_va_block_is_hmm(va_block) || 9559 uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED) 9560 return false; 9561 9562 if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET 9563 && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0) 9564 return false; 9565 9566 return true; 9567 } 9568 9569 // Returns the new access permission for the processor that faulted or 9570 // triggered access counter notifications on the given page 9571 // 9572 // TODO: Bug 1766424: this function works on a single page at a time. This 9573 // could be changed in the future to optimize multiple faults/counters on 9574 // contiguous pages. 9575 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block, 9576 uvm_va_block_context_t *va_block_context, 9577 uvm_page_index_t page_index, 9578 uvm_processor_id_t fault_processor_id, 9579 uvm_processor_id_t new_residency, 9580 uvm_fault_access_type_t access_type) 9581 { 9582 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9583 uvm_prot_t logical_prot, new_prot; 9584 9585 // TODO: Bug 1766432: Refactor into policies. Current policy is 9586 // query_promote: upgrade access privileges to avoid future faults IF 9587 // they don't trigger further revocations. 9588 new_prot = uvm_fault_access_type_to_prot(access_type); 9589 logical_prot = compute_logical_prot(va_block, va_block_context, page_index); 9590 9591 UVM_ASSERT(logical_prot >= new_prot); 9592 9593 if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY && 9594 !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) { 9595 uvm_processor_mask_t processors_with_atomic_mapping; 9596 uvm_processor_mask_t revoke_processors; 9597 9598 block_page_authorized_processors(va_block, 9599 page_index, 9600 UVM_PROT_READ_WRITE_ATOMIC, 9601 &processors_with_atomic_mapping); 9602 9603 uvm_processor_mask_andnot(&revoke_processors, 9604 &processors_with_atomic_mapping, 9605 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 9606 9607 // Only check if there are no faultable processors in the revoke 9608 // processors mask. 
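        // That is, the upgrade to READ_WRITE below only happens when it would
        // not force revoking atomic permissions from any faultable processor.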
9609 uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors); 9610 9611 if (uvm_processor_mask_empty(&revoke_processors)) 9612 new_prot = UVM_PROT_READ_WRITE; 9613 } 9614 if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) { 9615 if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id)) 9616 new_prot = UVM_PROT_READ_WRITE_ATOMIC; 9617 } 9618 9619 return new_prot; 9620 } 9621 9622 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block, 9623 uvm_va_block_context_t *va_block_context, 9624 uvm_processor_id_t new_residency, 9625 uvm_processor_id_t processor_id, 9626 const uvm_processor_mask_t *map_processors, 9627 uvm_va_block_region_t region, 9628 const uvm_page_mask_t *map_page_mask, 9629 uvm_prot_t max_prot, 9630 const uvm_processor_mask_t *thrashing_processors, 9631 uvm_tracker_t *tracker) 9632 { 9633 NV_STATUS status; 9634 uvm_processor_id_t map_processor_id; 9635 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9636 uvm_prot_t new_map_prot = max_prot; 9637 uvm_processor_mask_t map_processors_local; 9638 9639 uvm_processor_mask_copy(&map_processors_local, map_processors); 9640 9641 // Handle atomic mappings separately 9642 if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) { 9643 bool this_processor_has_native_atomics; 9644 9645 this_processor_has_native_atomics = 9646 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id); 9647 9648 if (this_processor_has_native_atomics) { 9649 uvm_processor_mask_t map_atomic_processors; 9650 9651 // Compute processors with native atomics to the residency 9652 uvm_processor_mask_and(&map_atomic_processors, 9653 &map_processors_local, 9654 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 9655 9656 // Filter out these mapped processors for the next steps 9657 uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors); 9658 9659 for_each_id_in_mask(map_processor_id, &map_atomic_processors) { 9660 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; 9661 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) 9662 cause = UvmEventMapRemoteCauseThrashing; 9663 9664 status = uvm_va_block_map(va_block, 9665 va_block_context, 9666 map_processor_id, 9667 region, 9668 map_page_mask, 9669 UVM_PROT_READ_WRITE_ATOMIC, 9670 cause, 9671 tracker); 9672 if (status != NV_OK) 9673 return status; 9674 } 9675 9676 new_map_prot = UVM_PROT_READ_WRITE; 9677 } 9678 else { 9679 if (UVM_ID_IS_CPU(processor_id)) 9680 new_map_prot = UVM_PROT_READ_WRITE; 9681 else 9682 new_map_prot = UVM_PROT_READ_ONLY; 9683 } 9684 } 9685 9686 // Map the rest of processors 9687 for_each_id_in_mask(map_processor_id, &map_processors_local) { 9688 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; 9689 uvm_prot_t final_map_prot; 9690 bool map_processor_has_enabled_system_wide_atomics = 9691 uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id); 9692 9693 // Write mappings from processors with disabled system-wide atomics are treated like atomics 9694 if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics) 9695 final_map_prot = UVM_PROT_READ_WRITE_ATOMIC; 9696 else 9697 final_map_prot = new_map_prot; 9698 9699 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) 9700 cause = 
UvmEventMapRemoteCauseThrashing; 9701 9702 status = uvm_va_block_map(va_block, 9703 va_block_context, 9704 map_processor_id, 9705 region, 9706 map_page_mask, 9707 final_map_prot, 9708 cause, 9709 tracker); 9710 if (status != NV_OK) 9711 return status; 9712 } 9713 9714 return NV_OK; 9715 } 9716 9717 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block, 9718 uvm_va_block_context_t *va_block_context, 9719 uvm_processor_id_t new_residency, 9720 uvm_processor_id_t processor_id, 9721 uvm_va_block_region_t region, 9722 const uvm_page_mask_t *map_page_mask, 9723 uvm_prot_t max_prot, 9724 const uvm_processor_mask_t *thrashing_processors) 9725 { 9726 NV_STATUS tracker_status, status = NV_OK; 9727 uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus; 9728 uvm_processor_id_t map_processor_id; 9729 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9730 const uvm_page_mask_t *final_page_mask = map_page_mask; 9731 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 9732 const uvm_va_policy_t *policy = va_block_context->policy; 9733 uvm_processor_id_t preferred_location; 9734 9735 uvm_assert_mutex_locked(&va_block->lock); 9736 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, region)); 9737 9738 // Read duplication takes precedence over SetAccessedBy. 9739 // 9740 // Exclude ranges with read duplication set... 9741 if (uvm_va_policy_is_read_duplicate(policy, va_space)) { 9742 status = NV_OK; 9743 goto out; 9744 } 9745 9746 // ... and pages read-duplicated by performance heuristics 9747 if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) { 9748 if (map_page_mask) { 9749 uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask, 9750 map_page_mask, 9751 &va_block->read_duplicated_pages); 9752 } 9753 else { 9754 uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages); 9755 } 9756 final_page_mask = &va_block_context->mapping.filtered_page_mask; 9757 } 9758 9759 // Add mappings for accessed_by processors and the given processor mask 9760 if (thrashing_processors) 9761 uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors); 9762 else 9763 uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by); 9764 9765 // Only processors that can access the new location must be considered 9766 uvm_processor_mask_and(&map_other_processors, 9767 &map_other_processors, 9768 &va_space->accessible_from[uvm_id_value(new_residency)]); 9769 9770 // Exclude caller processor as it must have already been mapped 9771 uvm_processor_mask_clear(&map_other_processors, processor_id); 9772 9773 // Exclude preferred location so it won't get remote mappings 9774 preferred_location = policy->preferred_location; 9775 if (UVM_ID_IS_VALID(preferred_location) && 9776 !uvm_id_equal(new_residency, preferred_location) && 9777 uvm_va_space_processor_has_memory(va_space, preferred_location)) { 9778 uvm_processor_mask_clear(&map_other_processors, preferred_location); 9779 } 9780 9781 // Map the UVM-Lite GPUs if the new location is the preferred location. This 9782 // will only create mappings on first touch. After that they're persistent 9783 // so uvm_va_block_map will be a no-op. 
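    // UVM-Lite GPUs are always mapped with full RWA protection and the
    // Coherence cause, and are then removed from map_other_processors so the
    // regular mapping loop below doesn't process them again.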
9784 uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block)); 9785 if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) && 9786 uvm_id_equal(new_residency, preferred_location)) { 9787 for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) { 9788 status = uvm_va_block_map(va_block, 9789 va_block_context, 9790 map_processor_id, 9791 region, 9792 final_page_mask, 9793 UVM_PROT_READ_WRITE_ATOMIC, 9794 UvmEventMapRemoteCauseCoherence, 9795 &local_tracker); 9796 if (status != NV_OK) 9797 goto out; 9798 } 9799 } 9800 9801 uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block)); 9802 9803 // We can't map non-migratable pages to the CPU. If we have any, build a 9804 // new mask of migratable pages and map the CPU separately. 9805 if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) && 9806 !uvm_range_group_all_migratable(va_space, 9807 uvm_va_block_region_start(va_block, region), 9808 uvm_va_block_region_end(va_block, region))) { 9809 uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask; 9810 9811 uvm_range_group_migratable_page_mask(va_block, region, migratable_mask); 9812 if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) { 9813 uvm_processor_mask_t cpu_mask; 9814 uvm_processor_mask_zero(&cpu_mask); 9815 uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU); 9816 9817 status = do_block_add_mappings_after_migration(va_block, 9818 va_block_context, 9819 new_residency, 9820 processor_id, 9821 &cpu_mask, 9822 region, 9823 migratable_mask, 9824 max_prot, 9825 thrashing_processors, 9826 &local_tracker); 9827 if (status != NV_OK) 9828 goto out; 9829 } 9830 9831 uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU); 9832 } 9833 9834 status = do_block_add_mappings_after_migration(va_block, 9835 va_block_context, 9836 new_residency, 9837 processor_id, 9838 &map_other_processors, 9839 region, 9840 final_page_mask, 9841 max_prot, 9842 thrashing_processors, 9843 &local_tracker); 9844 if (status != NV_OK) 9845 goto out; 9846 9847 out: 9848 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 9849 uvm_tracker_deinit(&local_tracker); 9850 return status == NV_OK ? tracker_status : status; 9851 } 9852 9853 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block, 9854 uvm_processor_id_t processor_id, 9855 uvm_page_index_t page_index) 9856 { 9857 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9858 uvm_processor_mask_t resident_processors; 9859 NvU32 resident_processors_count; 9860 9861 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id)) 9862 return UVM_PROT_READ_WRITE_ATOMIC; 9863 9864 uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); 9865 resident_processors_count = uvm_processor_mask_get_count(&resident_processors); 9866 9867 if (resident_processors_count == 0) { 9868 return UVM_PROT_NONE; 9869 } 9870 else if (resident_processors_count > 1) { 9871 // If there are many copies, we can only map READ ONLY 9872 // 9873 // The block state doesn't track the mapping target (aperture) of each 9874 // individual PTE, just the permissions and where the data is resident. 9875 // If the data is resident in multiple places, then we have a problem 9876 // since we can't know where the PTE points. This means we won't know 9877 // what needs to be unmapped for cases like UvmUnregisterGpu and 9878 // UvmDisablePeerAccess. 
9879 // 9880 // The simple way to solve this is to enforce that a read-duplication 9881 // mapping always points to local memory. 9882 if (uvm_processor_mask_test(&resident_processors, processor_id)) 9883 return UVM_PROT_READ_ONLY; 9884 9885 return UVM_PROT_NONE; 9886 } 9887 else { 9888 uvm_processor_id_t atomic_id; 9889 uvm_processor_id_t residency; 9890 uvm_processor_mask_t atomic_mappings; 9891 uvm_processor_mask_t write_mappings; 9892 9893 // Search the id of the processor with the only resident copy 9894 residency = uvm_processor_mask_find_first_id(&resident_processors); 9895 UVM_ASSERT(UVM_ID_IS_VALID(residency)); 9896 9897 // If we cannot map the processor with the resident copy, exit 9898 if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id)) 9899 return UVM_PROT_NONE; 9900 9901 // Fast path: if the page is not mapped anywhere else, it can be safely 9902 // mapped with RWA permission 9903 if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index)) 9904 return UVM_PROT_READ_WRITE_ATOMIC; 9905 9906 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); 9907 9908 // Exclude processors with system-wide atomics disabled from atomic_mappings 9909 uvm_processor_mask_and(&atomic_mappings, 9910 &atomic_mappings, 9911 &va_space->system_wide_atomics_enabled_processors); 9912 9913 // Exclude the processor for which the mapping protections are being computed 9914 uvm_processor_mask_clear(&atomic_mappings, processor_id); 9915 9916 // If there is any processor with atomic mapping, check if it has native atomics to the processor 9917 // with the resident copy. If it does not, we can only map READ ONLY 9918 atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings); 9919 if (UVM_ID_IS_VALID(atomic_id) && 9920 !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) { 9921 return UVM_PROT_READ_ONLY; 9922 } 9923 9924 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings); 9925 9926 // Exclude the processor for which the mapping protections are being computed 9927 uvm_processor_mask_clear(&write_mappings, processor_id); 9928 9929 // At this point, any processor with atomic mappings either has native 9930 // atomics support to the processor with the resident copy or has 9931 // disabled system-wide atomics. If the requesting processor has 9932 // disabled system-wide atomics or has native atomics to that processor, 9933 // we can map with ATOMIC privileges. Likewise, if there are no other 9934 // processors with WRITE or ATOMIC mappings, we can map with ATOMIC 9935 // privileges. For HMM, don't allow GPU atomic access to remote mapped 9936 // system memory even if there are no write mappings since CPU access 9937 // can be upgraded without notification. 
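        // Equivalently (sketch of the condition evaluated below; the names
        // are shorthand for the masks and tests used in the code):
        //
        //     atomic_ok = !swa_enabled(processor_id)              ||
        //                 native_atomics(residency, processor_id) ||
        //                 (no_other_write_mappings && !is_hmm_block)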
9938 if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) || 9939 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) || 9940 (uvm_processor_mask_empty(&write_mappings) && !uvm_va_block_is_hmm(va_block))) { 9941 return UVM_PROT_READ_WRITE_ATOMIC; 9942 } 9943 9944 return UVM_PROT_READ_WRITE; 9945 } 9946 } 9947 9948 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block, 9949 uvm_va_block_context_t *va_block_context, 9950 uvm_processor_id_t processor_id, 9951 uvm_va_block_region_t region, 9952 const uvm_page_mask_t *page_mask, 9953 UvmEventMapRemoteCause cause) 9954 { 9955 uvm_va_range_t *va_range = va_block->va_range; 9956 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 9957 NV_STATUS status = NV_OK; 9958 uvm_page_index_t page_index; 9959 uvm_range_group_range_iter_t iter; 9960 uvm_prot_t prot_to_map; 9961 9962 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region)); 9963 9964 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { 9965 if (!uvm_va_range_vma_check(va_range, va_block_context->mm)) 9966 return NV_OK; 9967 9968 uvm_range_group_range_migratability_iter_first(va_space, 9969 uvm_va_block_region_start(va_block, region), 9970 uvm_va_block_region_end(va_block, region), 9971 &iter); 9972 } 9973 9974 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) 9975 va_block_context->mask_by_prot[prot_to_map - 1].count = 0; 9976 9977 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 9978 // Read duplication takes precedence over SetAccessedBy. Exclude pages 9979 // read-duplicated by performance heuristics 9980 if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index)) 9981 continue; 9982 9983 prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index); 9984 if (prot_to_map == UVM_PROT_NONE) 9985 continue; 9986 9987 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { 9988 while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) { 9989 uvm_range_group_range_migratability_iter_next(va_space, 9990 &iter, 9991 uvm_va_block_region_end(va_block, region)); 9992 } 9993 9994 if (!iter.migratable) 9995 continue; 9996 } 9997 9998 if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0) 9999 uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask); 10000 10001 uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index); 10002 } 10003 10004 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) { 10005 if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0) 10006 continue; 10007 10008 status = uvm_va_block_map(va_block, 10009 va_block_context, 10010 processor_id, 10011 region, 10012 &va_block_context->mask_by_prot[prot_to_map - 1].page_mask, 10013 prot_to_map, 10014 cause, 10015 &va_block->tracker); 10016 if (status != NV_OK) 10017 break; 10018 } 10019 10020 return status; 10021 } 10022 10023 static bool can_read_duplicate(uvm_va_block_t *va_block, 10024 uvm_page_index_t page_index, 10025 const uvm_va_policy_t *policy, 10026 const uvm_perf_thrashing_hint_t *thrashing_hint) 10027 { 10028 if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block))) 10029 return true; 10030 10031 if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && 10032 
uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) && 10033 thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN) 10034 return true; 10035 10036 return false; 10037 } 10038 10039 // TODO: Bug 1827400: If the faulting processor has support for native 10040 // atomics to the current location and the faults on the page were 10041 // triggered by atomic accesses only, we keep the current residency. 10042 // This is a short-term solution to exercise remote atomics over 10043 // NVLINK when possible (not only when preferred location is set to 10044 // the remote GPU) as they are much faster than relying on page 10045 // faults and permission downgrades, which cause thrashing. In the 10046 // future, the thrashing detection/prevention heuristics should 10047 // detect and handle this case. 10048 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space, 10049 NvU32 access_type_mask, 10050 uvm_processor_id_t processor_id, 10051 uvm_processor_id_t residency) 10052 { 10053 // This policy can be enabled/disabled using a module parameter 10054 if (!uvm_perf_map_remote_on_native_atomics_fault) 10055 return false; 10056 10057 // Only consider atomics faults 10058 if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK) 10059 return false; 10060 10061 // We cannot differentiate CPU writes from atomics. We exclude CPU faults 10062 // from the logic explained above in order to avoid mapping CPU to vidmem 10063 // memory due to a write. 10064 if (UVM_ID_IS_CPU(processor_id)) 10065 return false; 10066 10067 // On P9 systems (which have native HW support for system-wide atomics), we 10068 // have determined experimentally that placing memory on a GPU yields the 10069 // best performance on most cases (since CPU can cache vidmem but not vice 10070 // versa). Therefore, don't map remotely if the current residency is 10071 // sysmem. 10072 if (UVM_ID_IS_CPU(residency)) 10073 return false; 10074 10075 return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id); 10076 } 10077 10078 // TODO: Bug 1766424: this function works on a single page at a time. This 10079 // could be changed in the future to optimize multiple faults or access 10080 // counter notifications on contiguous pages. 10081 static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block, 10082 uvm_va_block_context_t *va_block_context, 10083 uvm_page_index_t page_index, 10084 uvm_processor_id_t processor_id, 10085 NvU32 access_type_mask, 10086 const uvm_va_policy_t *policy, 10087 const uvm_perf_thrashing_hint_t *thrashing_hint, 10088 uvm_service_operation_t operation, 10089 bool *read_duplicate) 10090 { 10091 uvm_processor_id_t closest_resident_processor; 10092 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10093 bool may_read_duplicate; 10094 uvm_processor_id_t preferred_location; 10095 10096 // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as 10097 // HMM migration is implemented VMAs other than anonymous memory. 10098 if (is_uvm_fault_force_sysmem_set() || uvm_hmm_must_use_sysmem(va_block, va_block_context)) { 10099 *read_duplicate = false; 10100 return UVM_ID_CPU; 10101 } 10102 10103 may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint); 10104 10105 // Read/prefetch faults on a VA range with read duplication enabled 10106 // always create a copy of the page on the faulting processor's memory. 
10107 // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH, 10108 // which will lead to read duplication if it is enabled. 10109 *read_duplicate = may_read_duplicate && 10110 (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ); 10111 10112 if (*read_duplicate) 10113 return processor_id; 10114 10115 *read_duplicate = false; 10116 10117 // If read-duplication is active in the page but we are not 10118 // read-duplicating because the access type is not a read or a prefetch, 10119 // the faulting processor should get a local copy 10120 if (may_read_duplicate) 10121 return processor_id; 10122 10123 // If the faulting processor is the preferred location always migrate 10124 preferred_location = policy->preferred_location; 10125 if (uvm_id_equal(processor_id, preferred_location)) { 10126 if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) { 10127 UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN); 10128 if (uvm_va_space_processor_has_memory(va_space, processor_id)) 10129 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id)); 10130 } 10131 10132 return processor_id; 10133 } 10134 10135 // If the faulting processor is the CPU, HMM has to migrate the block to 10136 // system memory. 10137 // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements. 10138 if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block)) 10139 return processor_id; 10140 10141 if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 10142 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)], 10143 processor_id)); 10144 return thrashing_hint->pin.residency; 10145 } 10146 10147 closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id); 10148 10149 // If the page is not resident anywhere, select the preferred location as 10150 // long as the preferred location is accessible from the faulting processor. 10151 // Otherwise select the faulting processor. 10152 if (UVM_ID_IS_INVALID(closest_resident_processor)) { 10153 if (UVM_ID_IS_VALID(preferred_location) && 10154 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], 10155 processor_id)) { 10156 return preferred_location; 10157 } 10158 10159 return processor_id; 10160 } 10161 10162 // AccessedBy mappings might have not been created for the CPU if the thread 10163 // which made the memory resident did not have the proper references on the 10164 // mm_struct (for example, the GPU fault handling path when 10165 // uvm_va_space_mm_enabled() is false). 10166 // 10167 // Also, in uvm_migrate_*, we implement a two-pass scheme in which 10168 // AccessedBy mappings may be delayed to the second pass. This can produce 10169 // faults even if the faulting processor is in the accessed_by mask. 10170 // 10171 // Here, we keep it on the current residency and we just add the missing 10172 // mapping. 
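    // Access-counter servicing is excluded from this exception, presumably
    // because the point of those notifications is to move the data closer to
    // the accessing processor rather than to keep it where it is.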
10173 if (uvm_processor_mask_test(&policy->accessed_by, processor_id) && 10174 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) && 10175 operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { 10176 return closest_resident_processor; 10177 } 10178 10179 // Check if we should map the closest resident processor remotely on atomic 10180 // fault 10181 if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor)) 10182 return closest_resident_processor; 10183 10184 // If the processor has access to the preferred location, and the page is 10185 // not resident on the accessing processor, move it to the preferred 10186 // location. 10187 if (!uvm_id_equal(closest_resident_processor, processor_id) && 10188 UVM_ID_IS_VALID(preferred_location) && 10189 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id)) 10190 return preferred_location; 10191 10192 // If the page is resident on a processor other than the preferred location, 10193 // or the faulting processor can't access the preferred location, we select 10194 // the faulting processor as the new residency. 10195 return processor_id; 10196 } 10197 10198 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block, 10199 uvm_va_block_context_t *va_block_context, 10200 uvm_page_index_t page_index, 10201 uvm_processor_id_t processor_id, 10202 NvU32 access_type_mask, 10203 const uvm_va_policy_t *policy, 10204 const uvm_perf_thrashing_hint_t *thrashing_hint, 10205 uvm_service_operation_t operation, 10206 bool *read_duplicate) 10207 { 10208 uvm_processor_id_t id; 10209 10210 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, 10211 va_block_context->policy, 10212 uvm_va_block_region_for_page(page_index))); 10213 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10214 va_block_context, 10215 uvm_va_block_region_for_page(page_index))); 10216 10217 id = block_select_residency(va_block, 10218 va_block_context, 10219 page_index, 10220 processor_id, 10221 access_type_mask, 10222 policy, 10223 thrashing_hint, 10224 operation, 10225 read_duplicate); 10226 10227 // If the intended residency doesn't have memory, fall back to the CPU. 10228 if (!block_processor_has_memory(va_block, id)) { 10229 *read_duplicate = false; 10230 return UVM_ID_CPU; 10231 } 10232 10233 return id; 10234 } 10235 10236 static bool check_access_counters_dont_revoke(uvm_va_block_t *block, 10237 uvm_va_block_context_t *block_context, 10238 uvm_va_block_region_t region, 10239 const uvm_processor_mask_t *revoke_processors, 10240 const uvm_page_mask_t *revoke_page_mask, 10241 uvm_prot_t revoke_prot) 10242 { 10243 uvm_processor_id_t id; 10244 for_each_id_in_mask(id, revoke_processors) { 10245 const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot); 10246 10247 uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot); 10248 10249 UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0); 10250 } 10251 10252 return true; 10253 } 10254 10255 // Update service_context->prefetch_hint, service_context->per_processor_masks, 10256 // and service_context->region. 
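//
// The hint is only computed when every migrating page targets a single
// processor; otherwise prefetch_hint.residency is set to UVM_ID_INVALID.
// Pages added by the hint get a synthetic UVM_FAULT_ACCESS_TYPE_PREFETCH
// access type and are folded into that processor's new_residency mask and
// into service_context->region.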
10257 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block, 10258 uvm_service_block_context_t *service_context) 10259 { 10260 uvm_processor_id_t new_residency; 10261 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10262 10263 // Performance heuristics policy: we only consider prefetching when there 10264 // are migrations to a single processor, only. 10265 if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) { 10266 uvm_page_index_t page_index; 10267 uvm_page_mask_t *new_residency_mask; 10268 const uvm_va_policy_t *policy = service_context->block_context.policy; 10269 10270 new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors); 10271 new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10272 10273 // Update prefetch tracking structure with the pages that will migrate 10274 // due to faults 10275 uvm_perf_prefetch_get_hint(va_block, 10276 &service_context->block_context, 10277 new_residency, 10278 new_residency_mask, 10279 service_context->region, 10280 &service_context->prefetch_bitmap_tree, 10281 &service_context->prefetch_hint); 10282 10283 // Obtain the prefetch hint and give a fake fault access type to the 10284 // prefetched pages 10285 if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) { 10286 const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask; 10287 10288 for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) { 10289 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index)); 10290 10291 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH; 10292 10293 if (uvm_va_policy_is_read_duplicate(policy, va_space) || 10294 (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && 10295 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) { 10296 if (service_context->read_duplicate_count++ == 0) 10297 uvm_page_mask_zero(&service_context->read_duplicate_mask); 10298 10299 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); 10300 } 10301 } 10302 10303 uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask); 10304 service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask); 10305 } 10306 } 10307 else { 10308 service_context->prefetch_hint.residency = UVM_ID_INVALID; 10309 } 10310 } 10311 10312 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id, 10313 uvm_processor_id_t new_residency, 10314 uvm_va_block_t *va_block, 10315 uvm_va_block_retry_t *block_retry, 10316 uvm_service_block_context_t *service_context) 10317 { 10318 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10319 uvm_processor_mask_t *all_involved_processors = 10320 &service_context->block_context.make_resident.all_involved_processors; 10321 uvm_page_mask_t *new_residency_mask = 10322 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10323 uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; 10324 uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask; 10325 uvm_make_resident_cause_t cause; 10326 NV_STATUS status; 10327 10328 // 1- Migrate pages 10329 switch (service_context->operation) { 10330 case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS: 10331 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 10332 break; 10333 case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS: 10334 
cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 10335 break; 10336 case UVM_SERVICE_OPERATION_ACCESS_COUNTERS: 10337 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 10338 break; 10339 default: 10340 UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation); 10341 // Set cause to silence compiler warning that it may be unused. 10342 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 10343 break; 10344 } 10345 10346 // Reset masks before all of the make_resident calls 10347 uvm_page_mask_zero(did_migrate_mask); 10348 uvm_processor_mask_zero(all_involved_processors); 10349 10350 // Handle read duplication first so that the caller_page_mask will be free 10351 // to use below and still valid in uvm_va_block_service_finish(). 10352 // TODO: Bug 3660922: need to implement HMM read duplication support. 10353 if (service_context->read_duplicate_count != 0 && 10354 uvm_page_mask_and(caller_page_mask, 10355 new_residency_mask, 10356 &service_context->read_duplicate_mask)) { 10357 status = uvm_va_block_make_resident_read_duplicate(va_block, 10358 block_retry, 10359 &service_context->block_context, 10360 new_residency, 10361 service_context->region, 10362 caller_page_mask, 10363 &service_context->prefetch_hint.prefetch_pages_mask, 10364 cause); 10365 if (status != NV_OK) 10366 return status; 10367 } 10368 10369 if (service_context->read_duplicate_count == 0 || 10370 uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) { 10371 if (service_context->read_duplicate_count == 0) 10372 uvm_page_mask_copy(caller_page_mask, new_residency_mask); 10373 status = uvm_va_block_make_resident_copy(va_block, 10374 block_retry, 10375 &service_context->block_context, 10376 new_residency, 10377 service_context->region, 10378 caller_page_mask, 10379 &service_context->prefetch_hint.prefetch_pages_mask, 10380 cause); 10381 if (status != NV_OK) 10382 return status; 10383 } 10384 10385 if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors)) 10386 service_context->cpu_fault.did_migrate = true; 10387 10388 // 2- Check for ECC errors on all GPUs involved in the migration if CPU is 10389 // the destination. Migrations in response to CPU faults are special 10390 // because they're on the only path (apart from tools) where CUDA is not 10391 // involved and wouldn't have a chance to do its own ECC checking. 10392 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS && 10393 UVM_ID_IS_CPU(new_residency) && 10394 !uvm_processor_mask_empty(all_involved_processors)) { 10395 uvm_gpu_t *gpu; 10396 10397 // Before checking for ECC errors, make sure all of the GPU work 10398 // is finished. Creating mappings on the CPU would have to wait 10399 // for the tracker anyway so this shouldn't hurt performance. 10400 status = uvm_tracker_wait(&va_block->tracker); 10401 if (status != NV_OK) 10402 return status; 10403 10404 for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) { 10405 // We cannot call into RM here so use the no RM ECC check. 10406 status = uvm_gpu_check_ecc_error_no_rm(gpu); 10407 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 10408 // In case we need to call into RM to be sure whether 10409 // there is an ECC error or not, signal that to the 10410 // caller by adding the GPU to the mask. 
10411 // 10412 // In that case the ECC error might be noticed only after 10413 // the CPU mappings have been already created below, 10414 // exposing different CPU threads to the possibly corrupt 10415 // data, but this thread will fault eventually and that's 10416 // considered to be an acceptable trade-off between 10417 // performance and ECC error containment. 10418 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id); 10419 status = NV_OK; 10420 } 10421 if (status != NV_OK) 10422 return status; 10423 } 10424 } 10425 10426 return NV_OK; 10427 } 10428 10429 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id, 10430 uvm_va_block_t *va_block, 10431 uvm_service_block_context_t *service_context) 10432 { 10433 uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id; 10434 uvm_page_mask_t *new_residency_mask = 10435 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; 10436 uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; 10437 uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask; 10438 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10439 uvm_prot_t new_prot; 10440 uvm_page_index_t page_index; 10441 NV_STATUS status; 10442 10443 // Update residency. 10444 if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask)) 10445 uvm_va_block_make_resident_finish(va_block, 10446 &service_context->block_context, 10447 service_context->region, 10448 caller_page_mask); 10449 10450 uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask); 10451 10452 // The loops below depend on the enums having the following values in order 10453 // to index into service_context->mappings_by_prot[]. 10454 BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1); 10455 BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2); 10456 BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3); 10457 BUILD_BUG_ON(UVM_PROT_MAX != 4); 10458 10459 // 1- Compute mapping protections for the requesting processor on the new 10460 // residency. 10461 for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot) 10462 service_context->mappings_by_prot[new_prot - 1].count = 0; 10463 10464 for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) { 10465 new_prot = compute_new_permission(va_block, 10466 &service_context->block_context, 10467 page_index, 10468 processor_id, 10469 new_residency, 10470 service_context->access_type[page_index]); 10471 10472 if (service_context->mappings_by_prot[new_prot - 1].count++ == 0) 10473 uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask); 10474 10475 uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index); 10476 } 10477 10478 // 2- Revoke permissions 10479 // 10480 // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations. 10481 // Thus, we need to revoke only if residency did not change and we 10482 // are mapping higher than READ ONLY. 
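    // The candidate set for revocation is the faultable processors that
    // currently map the block, minus the faulting processor. Depending on the
    // system-wide atomics configuration, processors with native atomics to the
    // new residency or with system-wide atomics disabled are also excluded
    // (their atomics behave like writes), and UVM-Lite GPUs are never
    // downgraded since they must keep RWA mappings.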
10483 for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10484 bool pages_need_revocation; 10485 uvm_processor_mask_t revoke_processors; 10486 uvm_prot_t revoke_prot; 10487 bool this_processor_has_enabled_atomics; 10488 10489 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10490 continue; 10491 10492 pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask, 10493 &service_context->did_not_migrate_mask, 10494 &service_context->mappings_by_prot[new_prot - 1].page_mask); 10495 if (!pages_need_revocation) 10496 continue; 10497 10498 uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors); 10499 10500 // Do not revoke the processor that took the fault 10501 uvm_processor_mask_clear(&revoke_processors, processor_id); 10502 10503 this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, 10504 processor_id); 10505 10506 // Atomic operations on processors with system-wide atomics 10507 // disabled or with native atomics access to new_residency 10508 // behave like writes. 10509 if (new_prot == UVM_PROT_READ_WRITE || 10510 !this_processor_has_enabled_atomics || 10511 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) { 10512 10513 // Exclude processors with native atomics on the resident copy 10514 uvm_processor_mask_andnot(&revoke_processors, 10515 &revoke_processors, 10516 &va_space->has_native_atomics[uvm_id_value(new_residency)]); 10517 10518 // Exclude processors with disabled system-wide atomics 10519 uvm_processor_mask_and(&revoke_processors, 10520 &revoke_processors, 10521 &va_space->system_wide_atomics_enabled_processors); 10522 } 10523 10524 if (UVM_ID_IS_CPU(processor_id)) { 10525 revoke_prot = UVM_PROT_READ_WRITE_ATOMIC; 10526 } 10527 else { 10528 revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE: 10529 UVM_PROT_READ_WRITE_ATOMIC; 10530 } 10531 10532 // UVM-Lite processors must always have RWA mappings 10533 if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) { 10534 // Access counters should never trigger revocations apart from 10535 // read-duplication, which are performed in the calls to 10536 // uvm_va_block_make_resident_read_duplicate, above. 10537 if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { 10538 UVM_ASSERT(check_access_counters_dont_revoke(va_block, 10539 &service_context->block_context, 10540 service_context->region, 10541 &revoke_processors, 10542 &service_context->revocation_mask, 10543 revoke_prot)); 10544 } 10545 10546 // Downgrade other processors' mappings 10547 status = uvm_va_block_revoke_prot_mask(va_block, 10548 &service_context->block_context, 10549 &revoke_processors, 10550 service_context->region, 10551 &service_context->revocation_mask, 10552 revoke_prot); 10553 if (status != NV_OK) 10554 return status; 10555 } 10556 } 10557 10558 // 3- Map requesting processor with the necessary privileges 10559 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10560 const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask; 10561 10562 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10563 continue; 10564 10565 // 3.1 - Unmap CPU pages 10566 // HMM cpu mappings can be upgraded at any time without notification 10567 // so no need to downgrade first. 
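        // Note: the unmap/remap below only applies to managed (non-HMM) blocks
        // when the faulting processor is the CPU and the operation is not an
        // access counter notification.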
10568 if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS && 10569 UVM_ID_IS_CPU(processor_id) && 10570 !uvm_va_block_is_hmm(va_block)) { 10571 // The kernel can downgrade managed CPU mappings at any time without 10572 // notifying us, which means our PTE state could be stale. We 10573 // handle this by unmapping the CPU PTE and re-mapping it again. 10574 // 10575 // A CPU fault is unexpected if: 10576 // curr_prot == RW || (!is_write && curr_prot == RO) 10577 status = uvm_va_block_unmap(va_block, 10578 &service_context->block_context, 10579 UVM_ID_CPU, 10580 service_context->region, 10581 map_prot_mask, 10582 NULL); 10583 if (status != NV_OK) 10584 return status; 10585 } 10586 10587 // 3.2 - Add new mappings 10588 10589 // The faulting processor can be mapped remotely due to user policy or 10590 // the thrashing mitigation heuristics. Therefore, we set the cause 10591 // accordingly in each case. 10592 10593 // Map pages that are thrashing first 10594 if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) { 10595 uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask; 10596 bool pages_need_mapping = uvm_page_mask_and(helper_page_mask, 10597 map_prot_mask, 10598 &service_context->thrashing_pin_mask); 10599 if (pages_need_mapping) { 10600 status = uvm_va_block_map(va_block, 10601 &service_context->block_context, 10602 processor_id, 10603 service_context->region, 10604 helper_page_mask, 10605 new_prot, 10606 UvmEventMapRemoteCauseThrashing, 10607 &va_block->tracker); 10608 if (status != NV_OK) 10609 return status; 10610 10611 // Remove thrashing pages from the map mask 10612 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask, 10613 map_prot_mask, 10614 &service_context->thrashing_pin_mask); 10615 if (!pages_need_mapping) 10616 continue; 10617 10618 map_prot_mask = helper_page_mask; 10619 } 10620 } 10621 10622 status = uvm_va_block_map(va_block, 10623 &service_context->block_context, 10624 processor_id, 10625 service_context->region, 10626 map_prot_mask, 10627 new_prot, 10628 UvmEventMapRemoteCausePolicy, 10629 &va_block->tracker); 10630 if (status != NV_OK) 10631 return status; 10632 } 10633 10634 // 4- If pages did migrate, map SetAccessedBy processors, except for 10635 // UVM-Lite 10636 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { 10637 bool pages_need_mapping; 10638 10639 if (service_context->mappings_by_prot[new_prot - 1].count == 0) 10640 continue; 10641 10642 pages_need_mapping = uvm_page_mask_and(caller_page_mask, 10643 new_residency_mask, 10644 &service_context->mappings_by_prot[new_prot - 1].page_mask); 10645 if (!pages_need_mapping) 10646 continue; 10647 10648 // Map pages that are thrashing 10649 if (service_context->thrashing_pin_count > 0) { 10650 uvm_page_index_t page_index; 10651 10652 for_each_va_block_page_in_region_mask(page_index, 10653 &service_context->thrashing_pin_mask, 10654 service_context->region) { 10655 uvm_processor_mask_t *map_thrashing_processors = NULL; 10656 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index); 10657 10658 // Check protection type 10659 if (!uvm_page_mask_test(caller_page_mask, page_index)) 10660 continue; 10661 10662 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr); 10663 10664 status = uvm_va_block_add_mappings_after_migration(va_block, 10665 &service_context->block_context, 10666 new_residency, 10667 processor_id, 10668 uvm_va_block_region_for_page(page_index), 
10669 caller_page_mask, 10670 new_prot, 10671 map_thrashing_processors); 10672 if (status != NV_OK) 10673 return status; 10674 } 10675 10676 pages_need_mapping = uvm_page_mask_andnot(caller_page_mask, 10677 caller_page_mask, 10678 &service_context->thrashing_pin_mask); 10679 if (!pages_need_mapping) 10680 continue; 10681 } 10682 10683 // Map the rest of pages in a single shot 10684 status = uvm_va_block_add_mappings_after_migration(va_block, 10685 &service_context->block_context, 10686 new_residency, 10687 processor_id, 10688 service_context->region, 10689 caller_page_mask, 10690 new_prot, 10691 NULL); 10692 if (status != NV_OK) 10693 return status; 10694 } 10695 10696 return NV_OK; 10697 } 10698 10699 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id, 10700 uvm_va_block_t *va_block, 10701 uvm_va_block_retry_t *block_retry, 10702 uvm_service_block_context_t *service_context) 10703 { 10704 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10705 uvm_processor_id_t new_residency; 10706 NV_STATUS status = NV_OK; 10707 10708 uvm_assert_mutex_locked(&va_block->lock); 10709 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, 10710 service_context->block_context.policy, 10711 service_context->region)); 10712 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10713 &service_context->block_context, 10714 service_context->region)); 10715 10716 // GPU fault servicing must be done under the VA space read lock. GPU fault 10717 // servicing is required for RM to make forward progress, and we allow other 10718 // threads to call into RM while holding the VA space lock in read mode. If 10719 // we took the VA space lock in write mode on the GPU fault service path, 10720 // we could deadlock because the thread in RM which holds the VA space lock 10721 // for read wouldn't be able to complete until fault servicing completes. 
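    // Therefore, replayable GPU fault servicing asserts that the VA space lock
    // is held in read mode, while all other servicing paths (including CPU
    // faults) only assert that the lock is held.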
10722 if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id)) 10723 uvm_assert_rwsem_locked(&va_space->lock); 10724 else 10725 uvm_assert_rwsem_locked_read(&va_space->lock); 10726 10727 uvm_va_block_get_prefetch_hint(va_block, service_context); 10728 10729 for_each_id_in_mask(new_residency, &service_context->resident_processors) { 10730 if (uvm_va_block_is_hmm(va_block)) { 10731 status = uvm_hmm_va_block_service_locked(processor_id, new_residency, va_block, block_retry, service_context); 10732 if (status != NV_OK) 10733 break; 10734 10735 continue; 10736 } 10737 10738 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context); 10739 if (status != NV_OK) 10740 break; 10741 10742 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 10743 if (status != NV_OK) 10744 break; 10745 } 10746 10747 return status; 10748 } 10749 10750 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block, 10751 uvm_va_block_context_t *va_block_context, 10752 uvm_processor_id_t processor_id, 10753 uvm_page_index_t page_index, 10754 uvm_fault_type_t access_type, 10755 bool allow_migration) 10756 { 10757 uvm_va_range_t *va_range = va_block->va_range; 10758 uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type); 10759 10760 UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, 10761 va_block_context->policy, 10762 uvm_va_block_region_for_page(page_index))); 10763 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, 10764 va_block_context, 10765 uvm_va_block_region_for_page(page_index))); 10766 10767 // CPU permissions are checked later by block_map_cpu_page. 10768 // 10769 // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because 10770 // it can also be called from change_pte. Make change_pte call this 10771 // function and only check CPU permissions here. 10772 if (UVM_ID_IS_GPU(processor_id)) { 10773 if (va_range && uvm_va_range_is_managed_zombie(va_range)) 10774 return NV_ERR_INVALID_ADDRESS; 10775 10776 // GPU faults only check vma permissions if a mm is registered with the 10777 // VA space (ie. uvm_va_space_mm_retain_lock(va_space) != NULL) or if 10778 // uvm_enable_builtin_tests is set, because the Linux kernel can change 10779 // vm_flags at any moment (for example on mprotect) and here we are not 10780 // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that 10781 // this scenario does not happen. 10782 if ((va_block_context->mm || uvm_enable_builtin_tests) && 10783 (access_prot > compute_logical_prot(va_block, va_block_context, page_index))) 10784 return NV_ERR_INVALID_ACCESS_TYPE; 10785 } 10786 10787 // Non-migratable range: 10788 // - CPU accesses are always fatal, regardless of the VA range residency 10789 // - GPU accesses are fatal if the GPU can't map the preferred location 10790 if (!allow_migration) { 10791 UVM_ASSERT(!uvm_va_block_is_hmm(va_block)); 10792 10793 if (UVM_ID_IS_CPU(processor_id)) { 10794 return NV_ERR_INVALID_OPERATION; 10795 } 10796 else { 10797 uvm_va_space_t *va_space = va_range->va_space; 10798 10799 return uvm_processor_mask_test( 10800 &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)], 10801 processor_id)? 10802 NV_OK : NV_ERR_INVALID_ACCESS_TYPE; 10803 } 10804 } 10805 10806 return NV_OK; 10807 } 10808 10809 // Check if we are faulting on a page with valid permissions to check if we can 10810 // skip fault handling. 
See uvm_va_block_t::cpu::fault_authorized for more 10811 // details 10812 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block, 10813 uvm_page_index_t page_index, 10814 uvm_fault_access_type_t fault_access_type) 10815 { 10816 // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for 10817 // HMM? 10818 if (uvm_va_block_is_hmm(va_block)) 10819 return false; 10820 10821 if (block_page_is_processor_authorized(va_block, 10822 page_index, 10823 UVM_ID_CPU, 10824 uvm_fault_access_type_to_prot(fault_access_type))) { 10825 NvU64 now = NV_GETTIME(); 10826 pid_t pid = current->pid; 10827 10828 // Latch the pid/timestamp/page_index values for the first time 10829 if (!va_block->cpu.fault_authorized.first_fault_stamp) { 10830 va_block->cpu.fault_authorized.first_fault_stamp = now; 10831 va_block->cpu.fault_authorized.first_pid = pid; 10832 va_block->cpu.fault_authorized.page_index = page_index; 10833 10834 return true; 10835 } 10836 10837 // If the same thread shows up again, this means that the kernel 10838 // downgraded the page's PTEs. Service the fault to force a remap of 10839 // the page. 10840 if (va_block->cpu.fault_authorized.first_pid == pid && 10841 va_block->cpu.fault_authorized.page_index == page_index) { 10842 va_block->cpu.fault_authorized.first_fault_stamp = 0; 10843 } 10844 else { 10845 // If the window has expired, clear the information and service the 10846 // fault. Otherwise, just return 10847 if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns) 10848 va_block->cpu.fault_authorized.first_fault_stamp = 0; 10849 else 10850 return true; 10851 } 10852 } 10853 10854 return false; 10855 } 10856 10857 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block, 10858 uvm_va_block_retry_t *va_block_retry, 10859 NvU64 fault_addr, 10860 uvm_fault_access_type_t fault_access_type, 10861 uvm_service_block_context_t *service_context) 10862 { 10863 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 10864 NV_STATUS status = NV_OK; 10865 uvm_page_index_t page_index; 10866 uvm_perf_thrashing_hint_t thrashing_hint; 10867 uvm_processor_id_t new_residency; 10868 bool read_duplicate; 10869 10870 uvm_assert_rwsem_locked(&va_space->lock); 10871 10872 UVM_ASSERT(fault_addr >= va_block->start); 10873 UVM_ASSERT(fault_addr <= va_block->end); 10874 10875 uvm_assert_mmap_lock_locked(service_context->block_context.mm); 10876 10877 service_context->block_context.policy = uvm_va_policy_get(va_block, fault_addr); 10878 10879 if (service_context->num_retries == 0) { 10880 // notify event to tools/performance heuristics 10881 uvm_perf_event_notify_cpu_fault(&va_space->perf_events, 10882 va_block, 10883 service_context->block_context.policy->preferred_location, 10884 fault_addr, 10885 fault_access_type > UVM_FAULT_ACCESS_TYPE_READ, 10886 KSTK_EIP(current)); 10887 } 10888 10889 // Check logical permissions 10890 page_index = uvm_va_block_cpu_page_index(va_block, fault_addr); 10891 status = uvm_va_block_check_logical_permissions(va_block, 10892 &service_context->block_context, 10893 UVM_ID_CPU, 10894 page_index, 10895 fault_access_type, 10896 uvm_range_group_address_migratable(va_space, fault_addr)); 10897 if (status != NV_OK) 10898 return status; 10899 10900 uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc); 10901 10902 if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type)) 10903 return NV_OK; 10904 10905 thrashing_hint = 
uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU); 10906 // Throttling is implemented by sleeping in the fault handler on the CPU 10907 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) { 10908 service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp; 10909 return NV_WARN_MORE_PROCESSING_REQUIRED; 10910 } 10911 10912 service_context->read_duplicate_count = 0; 10913 service_context->thrashing_pin_count = 0; 10914 service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS; 10915 10916 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 10917 uvm_page_mask_zero(&service_context->thrashing_pin_mask); 10918 uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index); 10919 service_context->thrashing_pin_count = 1; 10920 } 10921 10922 // Compute new residency and update the masks 10923 new_residency = uvm_va_block_select_residency(va_block, 10924 &service_context->block_context, 10925 page_index, 10926 UVM_ID_CPU, 10927 uvm_fault_access_type_mask_bit(fault_access_type), 10928 service_context->block_context.policy, 10929 &thrashing_hint, 10930 UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS, 10931 &read_duplicate); 10932 10933 // Initialize the minimum necessary state in the fault service context 10934 uvm_processor_mask_zero(&service_context->resident_processors); 10935 10936 // Set new residency and update the masks 10937 uvm_processor_mask_set(&service_context->resident_processors, new_residency); 10938 10939 // The masks need to be fully zeroed as the fault region may grow due to prefetching 10940 uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 10941 uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index); 10942 10943 if (read_duplicate) { 10944 uvm_page_mask_zero(&service_context->read_duplicate_mask); 10945 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); 10946 service_context->read_duplicate_count = 1; 10947 } 10948 10949 service_context->access_type[page_index] = fault_access_type; 10950 10951 service_context->region = uvm_va_block_region_for_page(page_index); 10952 10953 status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context); 10954 10955 ++service_context->num_retries; 10956 10957 return status; 10958 } 10959 10960 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block, 10961 NvU64 fault_addr, 10962 bool is_write, 10963 uvm_service_block_context_t *service_context) 10964 { 10965 NV_STATUS status; 10966 uvm_va_block_retry_t va_block_retry; 10967 uvm_fault_access_type_t fault_access_type; 10968 10969 if (is_write) 10970 fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG; 10971 else 10972 fault_access_type = UVM_FAULT_ACCESS_TYPE_READ; 10973 10974 service_context->num_retries = 0; 10975 service_context->cpu_fault.did_migrate = false; 10976 10977 // We have to use vm_insert_page instead of handing the page to the kernel 10978 // and letting it insert the mapping, and we must do that while holding the 10979 // lock on this VA block. Otherwise there will be a window in which we think 10980 // we've mapped the page but the CPU mapping hasn't actually been created 10981 // yet. During that window a GPU fault event could arrive and claim 10982 // ownership of that VA, "unmapping" it. Then later the kernel would 10983 // eventually establish the mapping, and we'd end up with both CPU and GPU 10984 // thinking they each owned the page. 
10985 // 10986 // This function must only be called when it's safe to call vm_insert_page. 10987 // That is, there must be a reference held on the vma's vm_mm, and 10988 // vm_mm->mmap_lock is held in at least read mode. Note that current->mm 10989 // might not be vma->vm_mm. 10990 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, 10991 &va_block_retry, 10992 block_cpu_fault_locked(va_block, 10993 &va_block_retry, 10994 fault_addr, 10995 fault_access_type, 10996 service_context)); 10997 return status; 10998 } 10999 11000 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block) 11001 { 11002 uvm_va_range_t *va_range; 11003 uvm_va_block_t *block; 11004 size_t index; 11005 11006 va_range = uvm_va_range_find(va_space, addr); 11007 if (!va_range) 11008 return uvm_hmm_va_block_find(va_space, addr, out_block); 11009 11010 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || 11011 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); 11012 11013 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) 11014 return NV_ERR_INVALID_ADDRESS; 11015 11016 index = uvm_va_range_block_index(va_range, addr); 11017 block = uvm_va_range_block(va_range, index); 11018 if (!block) 11019 return NV_ERR_OBJECT_NOT_FOUND; 11020 11021 *out_block = block; 11022 return NV_OK; 11023 } 11024 11025 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space, 11026 uvm_va_range_t *va_range, 11027 NvU64 addr, 11028 uvm_va_block_context_t *va_block_context, 11029 uvm_va_block_t **out_block) 11030 { 11031 size_t index; 11032 11033 if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0) 11034 return NV_ERR_NO_MEMORY; 11035 11036 if (!va_range) { 11037 if (!va_block_context || !va_block_context->mm) 11038 return NV_ERR_INVALID_ADDRESS; 11039 return uvm_hmm_va_block_find_create(va_space, addr, va_block_context, out_block); 11040 } 11041 11042 UVM_ASSERT(addr >= va_range->node.start); 11043 UVM_ASSERT(addr <= va_range->node.end); 11044 11045 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || 11046 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); 11047 11048 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) 11049 return NV_ERR_INVALID_ADDRESS; 11050 11051 index = uvm_va_range_block_index(va_range, addr); 11052 return uvm_va_range_block_create(va_range, index, out_block); 11053 } 11054 11055 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space, 11056 NvU64 addr, 11057 uvm_va_block_context_t *va_block_context, 11058 uvm_va_block_t **out_block) 11059 { 11060 uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr); 11061 11062 return uvm_va_block_find_create_in_range(va_space, va_range, addr, va_block_context, out_block); 11063 } 11064 11065 // Launch a synchronous, encrypted copy between GPU and CPU. 11066 // 11067 // The copy entails a GPU-side encryption (relying on the Copy Engine), and a 11068 // CPU-side decryption step, such that the destination CPU buffer pointed by 11069 // dst_plain will contain the unencrypted (plain text) contents. The destination 11070 // buffer can be in protected or unprotected sysmem, while the source buffer 11071 // must be in protected vidmem. 11072 // 11073 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE. 11074 // 11075 // The input tracker, if not NULL, is internally acquired by the push 11076 // responsible for the encrypted copy. 
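// Internally, the copy stages through a DMA buffer taken from the GPU's
// conf_computing pool: the Copy Engine encrypts the vidmem source into the
// buffer along with an authentication tag, the push is waited on, and the CPU
// then decrypts the staged ciphertext into dst_plain using the IV logged for
// the channel.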
11077 __attribute__ ((format(printf, 6, 7))) 11078 static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu, 11079 void *dst_plain, 11080 uvm_gpu_address_t src_gpu_address, 11081 size_t size, 11082 uvm_tracker_t *tracker, 11083 const char *format, 11084 ...) 11085 { 11086 NV_STATUS status; 11087 UvmCslIv decrypt_iv; 11088 uvm_push_t push; 11089 uvm_conf_computing_dma_buffer_t *dma_buffer; 11090 uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address; 11091 void *src_cipher, *auth_tag; 11092 va_list args; 11093 11094 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 11095 UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE); 11096 11097 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL); 11098 if (status != NV_OK) 11099 return status; 11100 11101 va_start(args, format); 11102 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args); 11103 va_end(args); 11104 11105 if (status != NV_OK) 11106 goto out; 11107 11108 uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv); 11109 11110 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 11111 auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 11112 gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address); 11113 11114 status = uvm_push_end_and_wait(&push); 11115 if (status != NV_OK) 11116 goto out; 11117 11118 src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc); 11119 auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag); 11120 status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag); 11121 11122 out: 11123 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL); 11124 return status; 11125 } 11126 11127 // Launch a synchronous, encrypted copy between CPU and GPU. 11128 // 11129 // The source CPU buffer pointed by src_plain contains the unencrypted (plain 11130 // text) contents; the function internally performs a CPU-side encryption step 11131 // before launching the GPU-side CE decryption. The source buffer can be in 11132 // protected or unprotected sysmem, while the destination buffer must be in 11133 // protected vidmem. 11134 // 11135 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE. 11136 // 11137 // The input tracker, if not NULL, is internally acquired by the push 11138 // responsible for the encrypted copy. 11139 __attribute__ ((format(printf, 6, 7))) 11140 static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu, 11141 uvm_gpu_address_t dst_gpu_address, 11142 void *src_plain, 11143 size_t size, 11144 uvm_tracker_t *tracker, 11145 const char *format, 11146 ...) 
11147 { 11148 NV_STATUS status; 11149 uvm_push_t push; 11150 uvm_conf_computing_dma_buffer_t *dma_buffer; 11151 uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address; 11152 void *dst_cipher, *auth_tag; 11153 va_list args; 11154 11155 UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); 11156 UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE); 11157 11158 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL); 11159 if (status != NV_OK) 11160 return status; 11161 11162 va_start(args, format); 11163 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args); 11164 va_end(args); 11165 11166 if (status != NV_OK) 11167 goto out; 11168 11169 dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc); 11170 auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag); 11171 uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag); 11172 11173 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu); 11174 auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu); 11175 gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address); 11176 11177 status = uvm_push_end_and_wait(&push); 11178 11179 out: 11180 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL); 11181 return status; 11182 } 11183 11184 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block, 11185 uvm_gpu_t *gpu, 11186 uvm_gpu_address_t dst_gpu_address, 11187 NvU64 dst, 11188 uvm_mem_t *src_mem, 11189 size_t size) 11190 { 11191 NV_STATUS status; 11192 uvm_push_t push; 11193 uvm_gpu_address_t src_gpu_address; 11194 11195 if (uvm_conf_computing_mode_enabled(gpu)) { 11196 return encrypted_memcopy_cpu_to_gpu(gpu, 11197 dst_gpu_address, 11198 uvm_mem_get_cpu_addr_kernel(src_mem), 11199 size, 11200 &va_block->tracker, 11201 "Encrypted write to [0x%llx, 0x%llx)", 11202 dst, 11203 dst + size); 11204 } 11205 11206 status = uvm_push_begin_acquire(gpu->channel_manager, 11207 UVM_CHANNEL_TYPE_CPU_TO_GPU, 11208 &va_block->tracker, 11209 &push, 11210 "Direct write to [0x%llx, 0x%llx)", 11211 dst, 11212 dst + size); 11213 if (status != NV_OK) 11214 return status; 11215 11216 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu); 11217 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); 11218 return uvm_push_end_and_wait(&push); 11219 } 11220 11221 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block, 11222 uvm_va_block_context_t *block_context, 11223 NvU64 dst, 11224 uvm_mem_t *src_mem, 11225 size_t size) 11226 { 11227 NV_STATUS status; 11228 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst); 11229 NvU64 page_offset = dst & (PAGE_SIZE - 1); 11230 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); 11231 uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index); 11232 11233 uvm_assert_mutex_locked(&va_block->lock); 11234 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size); 11235 11236 if (UVM_ID_IS_INVALID(proc)) 11237 proc = UVM_ID_CPU; 11238 11239 block_context->policy = uvm_va_policy_get(va_block, dst); 11240 11241 // Use make_resident() in all cases to break read-duplication, but 11242 // block_retry can be NULL as if the page is not resident yet we will make 11243 // it resident on the 
CPU. 11244 // Notably we don't care about coherence with respect to atomics from other 11245 // processors. 11246 status = uvm_va_block_make_resident(va_block, 11247 NULL, 11248 block_context, 11249 proc, 11250 region, 11251 NULL, 11252 NULL, 11253 UVM_MAKE_RESIDENT_CAUSE_API_TOOLS); 11254 11255 if (status != NV_OK) 11256 return status; 11257 11258 if (UVM_ID_IS_CPU(proc)) { 11259 char *mapped_page; 11260 struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 11261 void *src = uvm_mem_get_cpu_addr_kernel(src_mem); 11262 11263 status = uvm_tracker_wait(&va_block->tracker); 11264 if (status != NV_OK) 11265 return status; 11266 11267 mapped_page = (char *)kmap(page); 11268 memcpy(mapped_page + page_offset, src, size); 11269 kunmap(page); 11270 11271 return NV_OK; 11272 } 11273 else { 11274 uvm_gpu_t *dst_gpu; 11275 uvm_gpu_address_t dst_gpu_address; 11276 11277 UVM_ASSERT(UVM_ID_IS_GPU(proc)); 11278 11279 dst_gpu = block_get_gpu(va_block, proc); 11280 11281 dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu); 11282 dst_gpu_address.address += page_offset; 11283 11284 return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size); 11285 } 11286 } 11287 11288 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block, 11289 uvm_mem_t *dst_mem, 11290 uvm_gpu_t *gpu, 11291 uvm_gpu_address_t src_gpu_address, 11292 NvU64 src, 11293 size_t size) 11294 { 11295 NV_STATUS status; 11296 uvm_push_t push; 11297 uvm_gpu_address_t dst_gpu_address; 11298 11299 if (uvm_conf_computing_mode_enabled(gpu)) { 11300 return encrypted_memcopy_gpu_to_cpu(gpu, 11301 uvm_mem_get_cpu_addr_kernel(dst_mem), 11302 src_gpu_address, 11303 size, 11304 &va_block->tracker, 11305 "Encrypted read from [0x%llx, 0x%llx)", 11306 src, 11307 src + size); 11308 } 11309 11310 status = uvm_push_begin_acquire(gpu->channel_manager, 11311 UVM_CHANNEL_TYPE_GPU_TO_CPU, 11312 &va_block->tracker, 11313 &push, 11314 "Direct read from [0x%llx, 0x%llx)", 11315 src, 11316 src + size); 11317 if (status != NV_OK) 11318 return status; 11319 11320 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu); 11321 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); 11322 return uvm_push_end_and_wait(&push); 11323 } 11324 11325 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size) 11326 { 11327 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src); 11328 NvU64 page_offset = src & (PAGE_SIZE - 1); 11329 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); 11330 void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem); 11331 11332 uvm_assert_mutex_locked(&va_block->lock); 11333 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size); 11334 11335 if (UVM_ID_IS_INVALID(proc)) { 11336 memset(dst, 0, size); 11337 return NV_OK; 11338 } 11339 else if (UVM_ID_IS_CPU(proc)) { 11340 NV_STATUS status; 11341 char *mapped_page; 11342 struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 11343 11344 status = uvm_tracker_wait(&va_block->tracker); 11345 if (status != NV_OK) 11346 return status; 11347 11348 mapped_page = (char *)kmap(page); 11349 memcpy(dst, mapped_page + page_offset, size); 11350 kunmap(page); 11351 11352 return NV_OK; 11353 } 11354 else { 11355 uvm_gpu_address_t src_gpu_address; 11356 uvm_gpu_t *gpu = block_get_gpu(va_block, proc); 11357 11358 
src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu); 11359 src_gpu_address.address += page_offset; 11360 11361 return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size); 11362 } 11363 } 11364 11365 // Deferred work item reestablishing accessed by mappings after eviction. On 11366 // GPUs with access counters enabled, the evicted GPU will also get remote 11367 // mappings. 11368 static void block_add_eviction_mappings(void *args) 11369 { 11370 uvm_va_block_t *va_block = (uvm_va_block_t*)args; 11371 uvm_va_space_t *va_space; 11372 uvm_processor_id_t id; 11373 uvm_va_block_context_t *block_context = NULL; 11374 struct mm_struct *mm = NULL; 11375 11376 uvm_mutex_lock(&va_block->lock); 11377 va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 11378 uvm_mutex_unlock(&va_block->lock); 11379 11380 if (!va_space) { 11381 // Block has been killed in the meantime 11382 goto done; 11383 } 11384 11385 mm = uvm_va_space_mm_retain_lock(va_space); 11386 11387 block_context = uvm_va_block_context_alloc(mm); 11388 if (!block_context) 11389 goto done; 11390 11391 // The block wasn't dead when we checked above and that's enough to 11392 // guarantee that the VA space is still around, because 11393 // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that 11394 // flush waits for this function call to finish. 11395 uvm_va_space_down_read(va_space); 11396 11397 // Now that we have the VA space lock held, we can check whether the block 11398 // is still alive since the VA space write lock is needed to kill blocks. 11399 if (uvm_va_block_is_dead(va_block)) 11400 goto unlock; 11401 11402 if (uvm_va_block_is_hmm(va_block)) { 11403 uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context); 11404 } 11405 else { 11406 uvm_va_range_t *va_range = va_block->va_range; 11407 NV_STATUS status = NV_OK; 11408 11409 block_context->policy = uvm_va_range_get_policy(va_range); 11410 for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) { 11411 status = uvm_va_block_set_accessed_by(va_block, block_context, id); 11412 if (status != NV_OK) 11413 break; 11414 } 11415 11416 if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) { 11417 uvm_processor_mask_t map_processors; 11418 11419 // Exclude the processors that have been already mapped due to 11420 // AccessedBy 11421 uvm_processor_mask_andnot(&map_processors, 11422 &va_block->evicted_gpus, 11423 &uvm_va_range_get_policy(va_range)->accessed_by); 11424 11425 for_each_gpu_id_in_mask(id, &map_processors) { 11426 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); 11427 uvm_va_block_gpu_state_t *gpu_state; 11428 11429 if (!gpu->parent->access_counters_supported) 11430 continue; 11431 11432 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 11433 UVM_ASSERT(gpu_state); 11434 11435 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add 11436 // remote mappings to read-duplicated pages. Add support for it 11437 // or create a new function. 
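                // Only the pages this GPU recorded in gpu_state->evicted are
                // mapped back here, and the mapping is attributed to eviction
                // via UvmEventMapRemoteCauseEviction.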
11438 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL, 11439 uvm_va_block_add_mappings(va_block, 11440 block_context, 11441 id, 11442 uvm_va_block_region_from_block(va_block), 11443 &gpu_state->evicted, 11444 UvmEventMapRemoteCauseEviction)); 11445 if (status != NV_OK) 11446 break; 11447 } 11448 } 11449 11450 if (status != NV_OK) { 11451 UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n", 11452 va_block->start, 11453 va_block->end, 11454 nvstatusToString(status), 11455 uvm_va_space_processor_name(va_space, id)); 11456 } 11457 } 11458 11459 unlock: 11460 uvm_va_space_up_read(va_space); 11461 uvm_va_block_context_free(block_context); 11462 11463 done: 11464 uvm_va_space_mm_release_unlock(va_space, mm); 11465 uvm_va_block_release(va_block); 11466 } 11467 11468 static void block_add_eviction_mappings_entry(void *args) 11469 { 11470 UVM_ENTRY_VOID(block_add_eviction_mappings(args)); 11471 } 11472 11473 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, 11474 uvm_gpu_t *gpu, 11475 uvm_gpu_chunk_t *root_chunk, 11476 uvm_tracker_t *tracker) 11477 { 11478 NV_STATUS status = NV_OK; 11479 NvU32 i; 11480 uvm_va_block_gpu_state_t *gpu_state; 11481 uvm_va_block_region_t chunk_region; 11482 size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu); 11483 size_t chunks_to_evict = 0; 11484 uvm_va_block_context_t *block_context; 11485 uvm_page_mask_t *pages_to_evict; 11486 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); 11487 uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block); 11488 struct mm_struct *mm; 11489 bool accessed_by_set = false; 11490 11491 uvm_assert_mutex_locked(&va_block->lock); 11492 11493 // The block might have been killed in the meantime 11494 if (!va_space) 11495 return NV_OK; 11496 11497 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 11498 if (!gpu_state) 11499 return NV_OK; 11500 11501 if (va_block_test && va_block_test->inject_eviction_error) { 11502 va_block_test->inject_eviction_error = false; 11503 return NV_ERR_NO_MEMORY; 11504 } 11505 11506 // We cannot take this block's VA space or mmap_lock locks on the eviction 11507 // path, however, we retain mm in order to support accounting of CPU memory 11508 // allocations. If mappings need to be created, 11509 // block_add_eviction_mappings() will be scheduled below. 
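    // mm may be NULL here (the VA space mm may be unavailable); the context
    // allocation below accepts a NULL mm and the release paths are guarded
    // accordingly.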
11510 mm = uvm_va_space_mm_retain(va_space); 11511 block_context = uvm_va_block_context_alloc(mm); 11512 if (!block_context) { 11513 if (mm) 11514 uvm_va_space_mm_release(va_space); 11515 return NV_ERR_NO_MEMORY; 11516 } 11517 11518 pages_to_evict = &block_context->caller_page_mask; 11519 uvm_page_mask_zero(pages_to_evict); 11520 chunk_region.outer = 0; 11521 11522 // Find all chunks that are subchunks of the root chunk 11523 for (i = 0; i < num_gpu_chunks; ++i) { 11524 uvm_chunk_size_t chunk_size; 11525 size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size); 11526 UVM_ASSERT(chunk_index == i); 11527 chunk_region.first = chunk_region.outer; 11528 chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE; 11529 11530 if (!gpu_state->chunks[i]) 11531 continue; 11532 if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk)) 11533 continue; 11534 11535 if (uvm_va_block_is_hmm(va_block)) { 11536 status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region); 11537 if (status != NV_OK) 11538 break; 11539 } 11540 11541 uvm_page_mask_region_fill(pages_to_evict, chunk_region); 11542 ++chunks_to_evict; 11543 } 11544 11545 if (chunks_to_evict == 0) 11546 goto out; 11547 11548 // Only move pages resident on the GPU 11549 uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id)); 11550 uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors); 11551 11552 if (uvm_va_block_is_hmm(va_block)) { 11553 status = uvm_hmm_va_block_evict_chunks(va_block, 11554 block_context, 11555 pages_to_evict, 11556 uvm_va_block_region_from_block(va_block), 11557 &accessed_by_set); 11558 } 11559 else { 11560 block_context->policy = uvm_va_range_get_policy(va_block->va_range); 11561 accessed_by_set = uvm_processor_mask_get_count(&block_context->policy->accessed_by) > 0; 11562 11563 // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's 11564 // not necessary to do so for eviction. Add a version that unmaps only 11565 // the processors that have mappings to the pages being evicted. 11566 status = uvm_va_block_make_resident(va_block, 11567 NULL, 11568 block_context, 11569 UVM_ID_CPU, 11570 uvm_va_block_region_from_block(va_block), 11571 pages_to_evict, 11572 NULL, 11573 UVM_MAKE_RESIDENT_CAUSE_EVICTION); 11574 } 11575 if (status != NV_OK) 11576 goto out; 11577 11578 // VA space lock may not be held and hence we cannot reestablish any 11579 // mappings here and need to defer it to a work queue. 11580 // 11581 // Reading the accessed_by mask without the VA space lock is safe because 11582 // adding a new processor to the mask triggers going over all the VA blocks 11583 // in the range and locking them. And we hold one of the VA block's locks. 11584 // 11585 // If uvm_va_range_set_accessed_by() hasn't called 11586 // uvm_va_block_set_accessed_by() for this block yet then it will take care 11587 // of adding the mapping after we are done. If it already did then we are 11588 // guaranteed to see the new processor in the accessed_by mask because we 11589 // locked the block's lock that the thread calling 11590 // uvm_va_range_set_accessed_by() unlocked after updating the mask. 11591 // 11592 // If a processor gets removed from the mask then we might not notice and 11593 // schedule the work item anyway, but that's benign as 11594 // block_add_eviction_mappings() re-examines the mask. 
11595 // 11596 // Checking if access counters migrations are enabled on a VA space is racy 11597 // without holding the VA space lock. However, this is fine as 11598 // block_add_eviction_mappings() reexamines the value with the VA space 11599 // lock being held. 11600 if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) { 11601 // Always retain the VA block first so that it's safe for the deferred 11602 // callback to release it immediately after it runs. 11603 uvm_va_block_retain(va_block); 11604 11605 if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q, 11606 &va_block->eviction_mappings_q_item)) { 11607 // And release it if no new callback was scheduled 11608 uvm_va_block_release_no_destroy(va_block); 11609 } 11610 } 11611 11612 status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker); 11613 if (status != NV_OK) 11614 goto out; 11615 11616 for (i = 0; i < num_gpu_chunks; ++i) { 11617 uvm_gpu_id_t accessing_gpu_id; 11618 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; 11619 11620 if (!chunk) 11621 continue; 11622 if (!uvm_gpu_chunk_same_root(chunk, root_chunk)) 11623 continue; 11624 11625 // Remove the mappings of indirect peers from the reverse map. We 11626 // access the indirect peer mask from the VA space without holding the 11627 // VA space lock. Therefore, we can race with enable_peer/disable_peer 11628 // operations. However this is fine: 11629 // 11630 // The enable_peer sequence is as follows: 11631 // 11632 // set_bit in va_space->indirect_peers 11633 // uvm_va_block_enable_peer; 11634 // 11635 // - If we read the mask BEFORE it is set or AFTER the mapping has 11636 // been added to the map there is no race. 11637 // - If we read the mask AFTER it is set but BEFORE adding the mapping 11638 // to the reverse map, we will try to remove it although it is not 11639 // there yet. Therefore, we use 11640 // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does 11641 // not check if the mapping is present in the reverse map. 11642 // 11643 // The disable_peer sequence is as follows: 11644 // 11645 // uvm_va_block_disable_peer; 11646 // clear_bit in va_space->indirect_peers 11647 // 11648 // - If we read the mask BEFORE the mapping has been added to the map 11649 // or AFTER the bit has been cleared, there is no race. 11650 // - If we read the mask AFTER the mapping has been removed and BEFORE 11651 // the bit is cleared, we will try to remove the mapping, too. 11652 // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works 11653 // in this scenario. 11654 // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect 11655 // peers are not supported when SMC is enabled. 
11656 for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { 11657 uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id); 11658 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); 11659 11660 uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings, 11661 peer_addr); 11662 } 11663 11664 uvm_mmu_chunk_unmap(chunk, tracker); 11665 11666 uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]); 11667 gpu_state->chunks[i] = NULL; 11668 } 11669 11670 out: 11671 uvm_va_block_context_free(block_context); 11672 if (mm) 11673 uvm_va_space_mm_release(va_space); 11674 11675 return status; 11676 } 11677 11678 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) 11679 { 11680 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); 11681 uvm_push_t push; 11682 NV_STATUS status; 11683 11684 // See comment in uvm_va_block_set_cancel 11685 UVM_ASSERT(!gpu->parent->fault_cancel_va_supported); 11686 11687 if (!gpu_state) 11688 return NV_ERR_NO_MEMORY; 11689 11690 // Force all pages to be 4K and prevent future upgrades during cancel 11691 gpu_state->force_4k_ptes = true; 11692 11693 // If we have no page tables we're done. For fault cancel we need to make 11694 // sure that fatal faults are on different 4k PTEs than non-fatal faults, 11695 // and we need to service all non-fatal faults before issuing the cancel. So 11696 // either all faults are fatal and we have no PTEs (we're PROT_NONE), or 11697 // we'll allocate PTEs later when we service the non-fatal faults. Those 11698 // PTEs will be 4k since force_4k_ptes is set. 11699 if (!block_gpu_has_page_tables(block, gpu)) 11700 return NV_OK; 11701 11702 // Are we 4k already? 11703 if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) 11704 return NV_OK; 11705 11706 status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL); 11707 if (status != NV_OK) 11708 return status; 11709 11710 status = uvm_push_begin_acquire(gpu->channel_manager, 11711 UVM_CHANNEL_TYPE_MEMOPS, 11712 &block->tracker, 11713 &push, 11714 "Forcing 4k PTEs on block [0x%llx, 0x%llx)", 11715 block->start, 11716 block->end + 1); 11717 if (status != NV_OK) 11718 return status; 11719 11720 if (gpu_state->pte_is_2m) 11721 block_gpu_split_2m(block, block_context, gpu, NULL, &push); 11722 else 11723 block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push); 11724 11725 uvm_push_end(&push); 11726 11727 UVM_ASSERT(block_check_mappings(block)); 11728 11729 return uvm_tracker_add_push_safe(&block->tracker, &push); 11730 } 11731 11732 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) 11733 { 11734 uvm_assert_mutex_locked(&va_block->lock); 11735 11736 // Volta+ devices support a global VA cancel method that does not require 11737 // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it 11738 // could result in 4k PTEs on P9 systems which otherwise would never need 11739 // them. 
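    // On pre-Volta GPUs the fallback below forces the block to 4k PTEs so
    // that, during cancel, fatal faults can be isolated on different 4k PTEs
    // than non-fatal ones.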
11740     if (gpu->parent->fault_cancel_va_supported)
11741         return NV_OK;
11742
11743     return block_gpu_force_4k_ptes(va_block, block_context, gpu);
11744 }
11745
11746 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
11747 {
11748     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11749     struct mm_struct *mm;
11750     uvm_va_block_t *va_block;
11751     uvm_va_block_test_t *va_block_test;
11752     uvm_va_block_context_t *block_context = NULL;
11753     NV_STATUS status = NV_OK;
11754
11755     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11756     uvm_va_space_down_read(va_space);
11757
11758     block_context = uvm_va_block_context_alloc(mm);
11759     if (!block_context) {
11760         status = NV_ERR_NO_MEMORY;
11761         goto out;
11762     }
11763
11764     status = uvm_va_block_find_create(va_space, params->lookup_address, block_context, &va_block);
11765     if (status != NV_OK)
11766         goto out;
11767
11768     va_block_test = uvm_va_block_get_test(va_block);
11769     UVM_ASSERT(va_block_test);
11770
11771     uvm_mutex_lock(&va_block->lock);
11772
11773     if (params->page_table_allocation_retry_force_count)
11774         va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;
11775
11776     if (params->user_pages_allocation_retry_force_count)
11777         va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;
11778
11779     if (params->cpu_chunk_allocation_size_mask) {
11780         if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
11781             !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
11782             status = NV_ERR_INVALID_ARGUMENT;
11783             goto block_unlock;
11784         }
11785
11786         va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
11787     }
11788
11789     if (params->eviction_error)
11790         va_block_test->inject_eviction_error = params->eviction_error;
11791
11792     if (params->cpu_pages_allocation_error_count)
11793         va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;
11794
11795     if (params->populate_error)
11796         va_block_test->inject_populate_error = params->populate_error;
11797
11798 block_unlock:
11799     uvm_mutex_unlock(&va_block->lock);
11800
11801 out:
11802     uvm_va_space_up_read(va_space);
11803     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11804     uvm_va_block_context_free(block_context);
11805     return status;
11806 }
11807
11808 static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
11809 {
11810     [UVM_TEST_PTE_MAPPING_INVALID]           = UVM_PROT_NONE,
11811     [UVM_TEST_PTE_MAPPING_READ_ONLY]         = UVM_PROT_READ_ONLY,
11812     [UVM_TEST_PTE_MAPPING_READ_WRITE]        = UVM_PROT_READ_WRITE,
11813     [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
11814 };
11815
11816 static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
11817 {
11818     [UVM_PROT_NONE]              = UVM_TEST_PTE_MAPPING_INVALID,
11819     [UVM_PROT_READ_ONLY]         = UVM_TEST_PTE_MAPPING_READ_ONLY,
11820     [UVM_PROT_READ_WRITE]        = UVM_TEST_PTE_MAPPING_READ_WRITE,
11821     [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
11822 };
11823
11824 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
11825 {
11826     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11827     uvm_va_block_t *block;
11828     struct mm_struct *mm;
11829     NV_STATUS status = NV_OK;
11830     uvm_prot_t curr_prot, new_prot;
11831     uvm_gpu_t *gpu = NULL;
11832     uvm_processor_id_t id;
11833     uvm_tracker_t local_tracker;
11834     uvm_va_block_region_t region;
11835     uvm_va_block_context_t *block_context = NULL;
11836
11837     if (!PAGE_ALIGNED(params->va))
11838         return NV_ERR_INVALID_ADDRESS;
11839
11840     if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
11841         return NV_ERR_INVALID_ARGUMENT;
11842
11843     new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];
11844
11845     // mmap_lock isn't needed for invalidating CPU mappings, but it will be
11846     // needed for inserting them.
11847     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11848     uvm_va_space_down_read(va_space);
11849
11850     if (uvm_uuid_is_cpu(&params->uuid)) {
11851         id = UVM_ID_CPU;
11852     }
11853     else {
11854         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
11855         if (!gpu) {
11856             status = NV_ERR_INVALID_DEVICE;
11857             goto out;
11858         }
11859
11860         // Check if the GPU can access the VA
11861         if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
11862             status = NV_ERR_OUT_OF_RANGE;
11863             goto out;
11864         }
11865
11866         id = gpu->id;
11867     }
11868
11869     block_context = uvm_va_block_context_alloc(mm);
11870     if (!block_context) {
11871         status = NV_ERR_NO_MEMORY;
11872         goto out;
11873     }
11874
11875     status = uvm_va_block_find_create(va_space, params->va, block_context, &block);
11876     if (status != NV_OK)
11877         goto out;
11878
11879     // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
11880     if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
11881         goto out;
11882
11883     uvm_mutex_lock(&block->lock);
11884
11885     region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
11886     curr_prot = block_page_prot(block, id, region.first);
11887
11888     if (new_prot == curr_prot) {
11889         status = NV_OK;
11890         goto out_block;
11891     }
11892
11893     // TODO: Bug 1766124: Upgrades might require revoking other processors'
11894     // access privileges. We just fail for now. Only downgrades are
11895     // supported. If we allowed upgrades, we would need to check the mm
11896     // like we do for revocation below.
11897     if (new_prot > curr_prot) {
11898         status = NV_ERR_INVALID_OPERATION;
11899         goto out_block;
11900     }
11901
11902     block_context->policy = uvm_va_policy_get(block, params->va);
11903
11904     if (new_prot == UVM_PROT_NONE) {
11905         status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
11906     }
11907     else {
11908         UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));
11909
11910         // Revoking CPU mappings performs a combination of unmap + map. The map
11911         // portion requires a valid mm.
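        //
        // Note on the argument below (a clarifying aside, not part of the
        // original comment): uvm_va_block_revoke_prot() is handed
        // new_prot + 1, i.e. presumably the lowest protection to revoke.
        // With the uvm_prot_t ordering used by the tables above, downgrading
        // to UVM_PROT_READ_ONLY passes UVM_PROT_READ_WRITE, dropping write
        // and atomic access while leaving read access mapped.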
11912         if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
11913             status = NV_ERR_INVALID_STATE;
11914         }
11915         else {
11916             status = uvm_va_block_revoke_prot(block,
11917                                               block_context,
11918                                               id,
11919                                               region,
11920                                               NULL,
11921                                               new_prot + 1,
11922                                               &block->tracker);
11923         }
11924     }
11925
11926 out_block:
11927     if (status == NV_OK)
11928         status = uvm_tracker_init_from(&local_tracker, &block->tracker);
11929
11930     uvm_mutex_unlock(&block->lock);
11931
11932     if (status == NV_OK)
11933         status = uvm_tracker_wait_deinit(&local_tracker);
11934
11935 out:
11936     uvm_va_space_up_read(va_space);
11937     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11938
11939     uvm_va_block_context_free(block_context);
11940
11941     return status;
11942 }
11943
11944 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
11945 {
11946     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11947     uvm_va_block_t *va_block;
11948     uvm_va_range_t *va_range;
11949     struct mm_struct *mm;
11950     size_t index;
11951     NV_STATUS status = NV_OK;
11952
11953     BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);
11954
11955     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11956     uvm_va_space_down_read(va_space);
11957
11958     va_range = uvm_va_range_find(va_space, params->lookup_address);
11959     if (!va_range) {
11960         status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
11961         if (status == NV_ERR_OBJECT_NOT_FOUND) {
11962             status = uvm_hmm_va_block_range_bounds(va_space,
11963                                                    mm,
11964                                                    params->lookup_address,
11965                                                    &params->va_block_start,
11966                                                    &params->va_block_end,
11967                                                    NULL);
11968             goto out;
11969         }
11970         else if (status != NV_OK) {
11971             goto out;
11972         }
11973     }
11974     else {
11975         index = uvm_va_range_block_index(va_range, params->lookup_address);
11976         va_block = uvm_va_range_block(va_range, index);
11977         if (!va_block) {
11978             status = NV_ERR_OBJECT_NOT_FOUND;
11979             goto out;
11980         }
11981     }
11982
11983     params->va_block_start = va_block->start;
11984     params->va_block_end = va_block->end;
11985
11986 out:
11987     uvm_va_space_up_read(va_space);
11988     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11989     return status;
11990 }
11991
11992 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
11993 {
11994     NV_STATUS status = NV_OK;
11995     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11996     uvm_va_range_t *va_range;
11997     uvm_va_block_t *block = NULL;
11998     struct mm_struct *mm;
11999     NvU32 count = 0;
12000     uvm_processor_mask_t resident_on_mask;
12001     uvm_processor_id_t id;
12002     uvm_page_index_t page_index;
12003     unsigned release_block_count = 0;
12004     NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
12005     size_t index;
12006
12007     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
12008     uvm_va_space_down_read(va_space);
12009
12010     // Inline uvm_va_block_find() to get the va_range.
12011     va_range = uvm_va_range_find(va_space, addr);
12012     if (!va_range) {
12013         NvU64 start, end;
12014
12015         status = uvm_hmm_va_block_find(va_space, addr, &block);
12016         if (status != NV_OK) {
12017             if (status != NV_ERR_OBJECT_NOT_FOUND)
12018                 goto out;
12019             status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
12020             goto out;
12021         }
12022         // Update current CPU mapping information.
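        //
        // Aside (a condensed restatement of the lookup fallback above, shared
        // with uvm_test_va_block_info(); illustrative only, no new behavior):
        //
        //     if (uvm_va_range_find(va_space, addr))
        //         block = uvm_va_range_block(va_range, uvm_va_range_block_index(va_range, addr));
        //     else if (uvm_hmm_va_block_find(va_space, addr, &block) == NV_OK)
        //         use the HMM block;
        //     else
        //         report uvm_hmm_va_block_range_bounds() only;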
12023         status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
12024         if (status != NV_OK) {
12025             block = NULL;
12026             goto out;
12027         }
12028     }
12029     else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
12030         status = NV_ERR_INVALID_ADDRESS;
12031         goto out;
12032     }
12033     else {
12034         index = uvm_va_range_block_index(va_range, addr);
12035         block = uvm_va_range_block(va_range, index);
12036         if (!block) {
12037             params->resident_on_count = 0;
12038             params->populated_on_count = 0;
12039             params->mapped_on_count = 0;
12040
12041             status = NV_OK;
12042
12043             goto out;
12044         }
12045     }
12046
12047     uvm_mutex_lock(&block->lock);
12048
12049     page_index = uvm_va_block_cpu_page_index(block, addr);
12050     uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);
12051
12052     for_each_id_in_mask(id, &resident_on_mask) {
12053         block_phys_page_t block_page = block_phys_page(id, page_index);
12054         uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
12055         params->resident_physical_size[count] = block_phys_page_size(block, block_page);
12056         if (UVM_ID_IS_CPU(id)) {
12057             params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
12058         }
12059         else {
12060             params->resident_physical_address[count] =
12061                 block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
12062         }
12063         ++count;
12064     }
12065     params->resident_on_count = count;
12066
12067     count = 0;
12068     for_each_id_in_mask(id, &block->mapped) {
12069         uvm_processor_id_t processor_to_map;
12070         block_phys_page_t block_page;
12071         NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12072
12073         if (page_size == 0)
12074             continue;
12075
12076         uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);
12077
12078         params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
12079         UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
12080         processor_to_map = block_get_processor_to_map(block, id, page_index);
12081         block_page = block_phys_page(processor_to_map, page_index);
12082
12083         if (!UVM_ID_IS_CPU(id)) {
12084             uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
12085                                                                            block_page,
12086                                                                            uvm_va_space_get_gpu(va_space, id));
12087             params->mapping_physical_address[count] = gpu_phys_addr.address;
12088         }
12089         else {
12090             struct page *page = block_page_get(block, block_page);
12091
12092             params->mapping_physical_address[count] = page_to_phys(page);
12093         }
12094
12095         params->page_size[count] = page_size;
12096         ++count;
12097     }
12098
12099     if (params->resident_on_count == 1) {
12100         if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) {
12101             if (uvm_pmm_sysmem_mappings_indirect_supported()) {
12102                 for_each_gpu_id(id) {
12103                     NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12104                     uvm_reverse_map_t sysmem_page;
12105                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
12106                     size_t num_pages;
12107                     uvm_gpu_t *gpu;
12108
12109                     if (!uvm_va_block_gpu_state_get(block, id))
12110                         continue;
12111
12112                     gpu = uvm_va_space_get_gpu(va_space, id);
12113
12114                     if (!gpu->parent->access_counters_supported)
12115                         continue;
12116
12117                     num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
12118                                                                     uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
12119                                                                     uvm_cpu_chunk_get_size(chunk),
12120                                                                     &sysmem_page,
12121                                                                     1);
12122                     if (page_size > 0)
12123                         UVM_ASSERT(num_pages == 1);
12124                     else
12125                         UVM_ASSERT(num_pages <= 1);
12126
12127                     if (num_pages == 1) {
12128                         UVM_ASSERT(sysmem_page.va_block == block);
12129                         UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
12130                         UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);
12131
12132                         ++release_block_count;
12133                     }
12134                 }
12135             }
12136         }
12137         else {
12138             uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask);
12139             uvm_reverse_map_t gpu_mapping;
12140             size_t num_pages;
12141             uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
12142             uvm_gpu_phys_address_t phys_addr;
12143
12144             phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
12145             num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
12146
12147             // Chunk may be in TEMP_PINNED state so it may not have a VA block
12148             // assigned. In that case, we don't get a valid translation.
12149             if (num_pages > 0) {
12150                 UVM_ASSERT(num_pages == 1);
12151                 UVM_ASSERT(gpu_mapping.va_block == block);
12152                 UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
12153
12154                 ++release_block_count;
12155             }
12156         }
12157     }
12158
12159     params->mapped_on_count = count;
12160
12161     count = 0;
12162     for_each_processor_id(id) {
12163         if (!block_processor_page_is_populated(block, id, page_index))
12164             continue;
12165
12166         uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
12167         ++count;
12168     }
12169     params->populated_on_count = count;
12170
12171 out:
12172     if (block) {
12173         if (!params->is_async && status == NV_OK)
12174             status = uvm_tracker_wait(&block->tracker);
12175         uvm_mutex_unlock(&block->lock);
12176         while (release_block_count--)
12177             uvm_va_block_release(block);
12178     }
12179     uvm_va_space_up_read(va_space);
12180     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
12181     return status;
12182 }
12183
12184 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
12185 {
12186     block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
12187 }
12188
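//
// Note on uvm_test_va_residency_info() above: each successful reverse
// translation, whether through uvm_pmm_sysmem_mappings_dma_to_virt() or
// uvm_pmm_gpu_phys_to_virt(), appears to return a referenced VA block (the
// retain is inferred from the release loop rather than shown here). The
// function therefore only counts the hits,
//
//     if (num_pages == 1)
//         ++release_block_count;
//
// and drops every reference in one place on the way out,
//
//     while (release_block_count--)
//         uvm_va_block_release(block);
//
// which keeps the retain/release pairing in a single spot regardless of
// which lookup paths matched.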