1 /******************************************************************************* 2 Copyright (c) 2015-2024 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "linux/sort.h" 25 #include "nv_uvm_interface.h" 26 #include "uvm_common.h" 27 #include "uvm_linux.h" 28 #include "uvm_global.h" 29 #include "uvm_gpu_replayable_faults.h" 30 #include "uvm_hal.h" 31 #include "uvm_kvmalloc.h" 32 #include "uvm_tools.h" 33 #include "uvm_va_block.h" 34 #include "uvm_va_range.h" 35 #include "uvm_va_space.h" 36 #include "uvm_va_space_mm.h" 37 #include "uvm_procfs.h" 38 #include "uvm_perf_thrashing.h" 39 #include "uvm_gpu_non_replayable_faults.h" 40 #include "uvm_ats_faults.h" 41 #include "uvm_test.h" 42 43 // The documentation at the beginning of uvm_gpu_non_replayable_faults.c 44 // provides some background for understanding replayable faults, non-replayable 45 // faults, and how UVM services each fault type. 46 47 // The HW fault buffer flush mode instructs RM on how to flush the hardware 48 // replayable fault buffer; it is only used in Confidential Computing. 49 // 50 // Unless HW_FAULT_BUFFER_FLUSH_MODE_MOVE is functionally required (because UVM 51 // needs to inspect the faults currently present in the HW fault buffer) it is 52 // recommended to use HW_FAULT_BUFFER_FLUSH_MODE_DISCARD for performance 53 // reasons. 54 typedef enum 55 { 56 // Flush the HW fault buffer, discarding all the resulting faults. UVM never 57 // gets to see these faults. 58 HW_FAULT_BUFFER_FLUSH_MODE_DISCARD, 59 60 // Flush the HW fault buffer, and move all the resulting faults to the SW 61 // fault ("shadow") buffer. 62 HW_FAULT_BUFFER_FLUSH_MODE_MOVE, 63 } hw_fault_buffer_flush_mode_t; 64 65 #define UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT 1000 66 67 // Lapse of time in milliseconds after which prefetch faults can be re-enabled. 
// 0 means it is never disabled
static unsigned uvm_perf_reenable_prefetch_faults_lapse_msec = UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT;
module_param(uvm_perf_reenable_prefetch_faults_lapse_msec, uint, S_IRUGO);

#define UVM_PERF_FAULT_BATCH_COUNT_MIN 1
#define UVM_PERF_FAULT_BATCH_COUNT_DEFAULT 256

// Number of entries that are fetched from the GPU fault buffer and serviced in
// a batch
static unsigned uvm_perf_fault_batch_count = UVM_PERF_FAULT_BATCH_COUNT_DEFAULT;
module_param(uvm_perf_fault_batch_count, uint, S_IRUGO);

#define UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH

// Policy that determines when to issue fault replays
static uvm_perf_fault_replay_policy_t uvm_perf_fault_replay_policy = UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;
module_param(uvm_perf_fault_replay_policy, uint, S_IRUGO);

#define UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT 50

// Reading fault buffer GET/PUT pointers from the CPU is expensive. However,
// updating PUT before flushing the buffer helps minimize the number of
// duplicates in the buffer, as it discards faults that were not processed
// because of the batch size limit or because they arrived during servicing.
// If PUT is not updated, the replay operation will make them show up again
// in the buffer as duplicates.
//
// We keep track of the number of duplicates in each batch, and we use
// UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT for the subsequent fault buffer flush
// if the percentage of duplicate faults in a batch is greater than the ratio
// defined in the following module parameter. Otherwise,
// UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT is used.
static unsigned uvm_perf_fault_replay_update_put_ratio = UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT;
module_param(uvm_perf_fault_replay_update_put_ratio, uint, S_IRUGO);

#define UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT 20

#define UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT 5

// Maximum number of batches to be processed per execution of the bottom-half
static unsigned uvm_perf_fault_max_batches_per_service = UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT;
module_param(uvm_perf_fault_max_batches_per_service, uint, S_IRUGO);

// Maximum number of batches with thrashing pages per execution of the
// bottom-half
static unsigned uvm_perf_fault_max_throttle_per_service = UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT;
module_param(uvm_perf_fault_max_throttle_per_service, uint, S_IRUGO);

// Enable/disable coalescing of duplicate faults (same instance pointer and
// virtual address) when fetching entries from the fault buffer
static unsigned uvm_perf_fault_coalesce = 1;
module_param(uvm_perf_fault_coalesce, uint, S_IRUGO);

// This function is used for both the initial fault buffer initialization and
// the power management resume path.
static void fault_buffer_reinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;

    // Read the current get/put pointers, as this might not be the first time
    // we take control of the fault buffer since the GPU was initialized,
    // or since we may need to bring UVM's cached copies back in sync following
    // a sleep cycle.
128 replayable_faults->cached_get = parent_gpu->fault_buffer_hal->read_get(parent_gpu); 129 replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu); 130 131 // (Re-)enable fault prefetching 132 if (parent_gpu->fault_buffer_info.prefetch_faults_enabled) 133 parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu); 134 else 135 parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu); 136 } 137 138 // There is no error handling in this function. The caller is in charge of 139 // calling fault_buffer_deinit_replayable_faults on failure. 140 static NV_STATUS fault_buffer_init_replayable_faults(uvm_parent_gpu_t *parent_gpu) 141 { 142 NV_STATUS status = NV_OK; 143 uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable; 144 uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context; 145 146 UVM_ASSERT(parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize % 147 parent_gpu->fault_buffer_hal->entry_size(parent_gpu) == 0); 148 149 replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize / 150 parent_gpu->fault_buffer_hal->entry_size(parent_gpu); 151 152 // Check provided module parameter value 153 parent_gpu->fault_buffer_info.max_batch_size = max(uvm_perf_fault_batch_count, 154 (NvU32)UVM_PERF_FAULT_BATCH_COUNT_MIN); 155 parent_gpu->fault_buffer_info.max_batch_size = min(parent_gpu->fault_buffer_info.max_batch_size, 156 replayable_faults->max_faults); 157 158 if (parent_gpu->fault_buffer_info.max_batch_size != uvm_perf_fault_batch_count) { 159 pr_info("Invalid uvm_perf_fault_batch_count value on GPU %s: %u. Valid range [%u:%u] Using %u instead\n", 160 uvm_parent_gpu_name(parent_gpu), 161 uvm_perf_fault_batch_count, 162 UVM_PERF_FAULT_BATCH_COUNT_MIN, 163 replayable_faults->max_faults, 164 parent_gpu->fault_buffer_info.max_batch_size); 165 } 166 167 batch_context->fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults * sizeof(*batch_context->fault_cache)); 168 if (!batch_context->fault_cache) 169 return NV_ERR_NO_MEMORY; 170 171 // fault_cache is used to signal that the tracker was initialized. 172 uvm_tracker_init(&replayable_faults->replay_tracker); 173 174 batch_context->ordered_fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults * 175 sizeof(*batch_context->ordered_fault_cache)); 176 if (!batch_context->ordered_fault_cache) 177 return NV_ERR_NO_MEMORY; 178 179 // This value must be initialized by HAL 180 UVM_ASSERT(replayable_faults->utlb_count > 0); 181 182 batch_context->utlbs = uvm_kvmalloc_zero(replayable_faults->utlb_count * sizeof(*batch_context->utlbs)); 183 if (!batch_context->utlbs) 184 return NV_ERR_NO_MEMORY; 185 186 batch_context->max_utlb_id = 0; 187 188 status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_TRUE)); 189 if (status != NV_OK) { 190 UVM_ERR_PRINT("Failed to take page fault ownership from RM: %s, GPU %s\n", 191 nvstatusToString(status), 192 uvm_parent_gpu_name(parent_gpu)); 193 return status; 194 } 195 196 replayable_faults->replay_policy = uvm_perf_fault_replay_policy < UVM_PERF_FAULT_REPLAY_POLICY_MAX? 197 uvm_perf_fault_replay_policy: 198 UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT; 199 200 if (replayable_faults->replay_policy != uvm_perf_fault_replay_policy) { 201 pr_info("Invalid uvm_perf_fault_replay_policy value on GPU %s: %d. 
Using %d instead\n", 202 uvm_parent_gpu_name(parent_gpu), 203 uvm_perf_fault_replay_policy, 204 replayable_faults->replay_policy); 205 } 206 207 replayable_faults->replay_update_put_ratio = min(uvm_perf_fault_replay_update_put_ratio, 100u); 208 if (replayable_faults->replay_update_put_ratio != uvm_perf_fault_replay_update_put_ratio) { 209 pr_info("Invalid uvm_perf_fault_replay_update_put_ratio value on GPU %s: %u. Using %u instead\n", 210 uvm_parent_gpu_name(parent_gpu), 211 uvm_perf_fault_replay_update_put_ratio, 212 replayable_faults->replay_update_put_ratio); 213 } 214 215 // Re-enable fault prefetching just in case it was disabled in a previous run 216 parent_gpu->fault_buffer_info.prefetch_faults_enabled = parent_gpu->prefetch_fault_supported; 217 218 fault_buffer_reinit_replayable_faults(parent_gpu); 219 220 return NV_OK; 221 } 222 223 static void fault_buffer_deinit_replayable_faults(uvm_parent_gpu_t *parent_gpu) 224 { 225 uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable; 226 uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context; 227 228 if (batch_context->fault_cache) { 229 UVM_ASSERT(uvm_tracker_is_empty(&replayable_faults->replay_tracker)); 230 uvm_tracker_deinit(&replayable_faults->replay_tracker); 231 } 232 233 if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) { 234 // Re-enable prefetch faults in case we disabled them 235 if (parent_gpu->prefetch_fault_supported && !parent_gpu->fault_buffer_info.prefetch_faults_enabled) 236 parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu); 237 } 238 239 uvm_kvfree(batch_context->fault_cache); 240 uvm_kvfree(batch_context->ordered_fault_cache); 241 uvm_kvfree(batch_context->utlbs); 242 batch_context->fault_cache = NULL; 243 batch_context->ordered_fault_cache = NULL; 244 batch_context->utlbs = NULL; 245 } 246 247 NV_STATUS uvm_parent_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu) 248 { 249 NV_STATUS status = NV_OK; 250 251 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 252 UVM_ASSERT(parent_gpu->replayable_faults_supported); 253 254 status = uvm_rm_locked_call(nvUvmInterfaceInitFaultInfo(parent_gpu->rm_device, 255 &parent_gpu->fault_buffer_info.rm_info)); 256 if (status != NV_OK) { 257 UVM_ERR_PRINT("Failed to init fault buffer info from RM: %s, GPU %s\n", 258 nvstatusToString(status), 259 uvm_parent_gpu_name(parent_gpu)); 260 261 // nvUvmInterfaceInitFaultInfo may leave fields in rm_info populated 262 // when it returns an error. Set the buffer handle to zero as it is 263 // used by the deinitialization logic to determine if it was correctly 264 // initialized. 265 parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0; 266 goto fail; 267 } 268 269 status = fault_buffer_init_replayable_faults(parent_gpu); 270 if (status != NV_OK) 271 goto fail; 272 273 if (parent_gpu->non_replayable_faults_supported) { 274 status = uvm_parent_gpu_fault_buffer_init_non_replayable_faults(parent_gpu); 275 if (status != NV_OK) 276 goto fail; 277 } 278 279 return NV_OK; 280 281 fail: 282 uvm_parent_gpu_fault_buffer_deinit(parent_gpu); 283 284 return status; 285 } 286 287 // Reinitialize state relevant to replayable fault handling after returning 288 // from a power management cycle. 
void uvm_parent_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(parent_gpu->replayable_faults_supported);

    fault_buffer_reinit_replayable_faults(parent_gpu);
}

void uvm_parent_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    if (parent_gpu->non_replayable_faults_supported)
        uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(parent_gpu);

    fault_buffer_deinit_replayable_faults(parent_gpu);

    if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
        status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_FALSE));
        UVM_ASSERT(status == NV_OK);

        uvm_rm_locked_call_void(nvUvmInterfaceDestroyFaultInfo(parent_gpu->rm_device,
                                                               &parent_gpu->fault_buffer_info.rm_info));

        parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
    }
}

bool uvm_parent_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;

    UVM_ASSERT(parent_gpu->replayable_faults_supported);

    // Fast path 1: we left some faults unserviced in the buffer in the last
    // pass
    if (replayable_faults->cached_get != replayable_faults->cached_put)
        return true;

    // Fast path 2: read the valid bit of the fault buffer entry pointed to by
    // the cached get pointer
    if (!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, replayable_faults->cached_get)) {
        // Slow path: read the put pointer from the GPU register via BAR0
        // over PCIe
        replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);

        // No interrupt pending
        if (replayable_faults->cached_get == replayable_faults->cached_put)
            return false;
    }

    return true;
}

// Push a fault cancel method on the given client. Any failure during this
// operation may lead to an application hang (requiring a manual Ctrl+C from
// the user) or a system crash (requiring a reboot); in that case we log an
// error message.
//
// gpc_id and client_id aren't used if global_cancel is true.
349 // 350 // This function acquires both the given tracker and the replay tracker 351 static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu, 352 uvm_gpu_phys_address_t instance_ptr, 353 bool global_cancel, 354 NvU32 gpc_id, 355 NvU32 client_id, 356 uvm_tracker_t *tracker) 357 { 358 NV_STATUS status; 359 uvm_push_t push; 360 uvm_tracker_t *replay_tracker = &gpu->parent->fault_buffer_info.replayable.replay_tracker; 361 362 UVM_ASSERT(tracker != NULL); 363 364 status = uvm_tracker_add_tracker_safe(tracker, replay_tracker); 365 if (status != NV_OK) 366 return status; 367 368 if (global_cancel) { 369 status = uvm_push_begin_acquire(gpu->channel_manager, 370 UVM_CHANNEL_TYPE_MEMOPS, 371 tracker, 372 &push, 373 "Cancel targeting instance_ptr {0x%llx:%s}\n", 374 instance_ptr.address, 375 uvm_aperture_string(instance_ptr.aperture)); 376 } 377 else { 378 status = uvm_push_begin_acquire(gpu->channel_manager, 379 UVM_CHANNEL_TYPE_MEMOPS, 380 tracker, 381 &push, 382 "Cancel targeting instance_ptr {0x%llx:%s} gpc %u client %u\n", 383 instance_ptr.address, 384 uvm_aperture_string(instance_ptr.aperture), 385 gpc_id, 386 client_id); 387 } 388 389 UVM_ASSERT(status == NV_OK); 390 if (status != NV_OK) { 391 UVM_ERR_PRINT("Failed to create push and acquire trackers before pushing cancel: %s, GPU %s\n", 392 nvstatusToString(status), 393 uvm_gpu_name(gpu)); 394 return status; 395 } 396 397 if (global_cancel) 398 gpu->parent->host_hal->cancel_faults_global(&push, instance_ptr); 399 else 400 gpu->parent->host_hal->cancel_faults_targeted(&push, instance_ptr, gpc_id, client_id); 401 402 // We don't need to put the cancel in the GPU replay tracker since we wait 403 // on it immediately. 404 status = uvm_push_end_and_wait(&push); 405 406 UVM_ASSERT(status == NV_OK); 407 if (status != NV_OK) 408 UVM_ERR_PRINT("Failed to wait for pushed cancel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 409 410 // The cancellation is complete, so the input trackers must be complete too. 411 uvm_tracker_clear(tracker); 412 uvm_tracker_clear(replay_tracker); 413 414 return status; 415 } 416 417 static NV_STATUS push_cancel_on_gpu_targeted(uvm_gpu_t *gpu, 418 uvm_gpu_phys_address_t instance_ptr, 419 NvU32 gpc_id, 420 NvU32 client_id, 421 uvm_tracker_t *tracker) 422 { 423 return push_cancel_on_gpu(gpu, instance_ptr, false, gpc_id, client_id, tracker); 424 } 425 426 static NV_STATUS push_cancel_on_gpu_global(uvm_gpu_t *gpu, uvm_gpu_phys_address_t instance_ptr, uvm_tracker_t *tracker) 427 { 428 UVM_ASSERT(!gpu->parent->smc.enabled); 429 430 return push_cancel_on_gpu(gpu, instance_ptr, true, 0, 0, tracker); 431 } 432 433 // Volta implements a targeted VA fault cancel that simplifies the fault cancel 434 // process. You only need to specify the address, type, and mmu_engine_id for 435 // the access to be cancelled. Caller must hold the VA space lock for the access 436 // to be cancelled. 
437 static NV_STATUS cancel_fault_precise_va(uvm_gpu_t *gpu, 438 uvm_fault_buffer_entry_t *fault_entry, 439 uvm_fault_cancel_va_mode_t cancel_va_mode) 440 { 441 NV_STATUS status; 442 uvm_gpu_va_space_t *gpu_va_space; 443 uvm_gpu_phys_address_t pdb; 444 uvm_push_t push; 445 uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; 446 NvU64 offset; 447 448 UVM_ASSERT(gpu->parent->replayable_faults_supported); 449 UVM_ASSERT(fault_entry->fatal_reason != UvmEventFatalReasonInvalid); 450 UVM_ASSERT(!fault_entry->filtered); 451 452 gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(fault_entry->va_space, gpu->parent); 453 UVM_ASSERT(gpu_va_space); 454 pdb = uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr; 455 456 // Record fatal fault event 457 uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason); 458 459 status = uvm_push_begin_acquire(gpu->channel_manager, 460 UVM_CHANNEL_TYPE_MEMOPS, 461 &replayable_faults->replay_tracker, 462 &push, 463 "Precise cancel targeting PDB {0x%llx:%s} VA 0x%llx VEID %u with access type %s", 464 pdb.address, 465 uvm_aperture_string(pdb.aperture), 466 fault_entry->fault_address, 467 fault_entry->fault_source.ve_id, 468 uvm_fault_access_type_string(fault_entry->fault_access_type)); 469 if (status != NV_OK) { 470 UVM_ERR_PRINT("Failed to create push and acquire replay tracker before pushing cancel: %s, GPU %s\n", 471 nvstatusToString(status), 472 uvm_gpu_name(gpu)); 473 return status; 474 } 475 476 // UVM aligns fault addresses to PAGE_SIZE as it is the smallest mapping 477 // and coherence tracking granularity. However, the cancel method requires 478 // the original address (4K-aligned) reported in the packet, which is lost 479 // at this point. Since the access permissions are the same for the whole 480 // 64K page, we issue a cancel per 4K range to make sure that the HW sees 481 // the address reported in the packet. 482 for (offset = 0; offset < PAGE_SIZE; offset += UVM_PAGE_SIZE_4K) { 483 gpu->parent->host_hal->cancel_faults_va(&push, pdb, fault_entry, cancel_va_mode); 484 fault_entry->fault_address += UVM_PAGE_SIZE_4K; 485 } 486 fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address - 1); 487 488 // We don't need to put the cancel in the GPU replay tracker since we wait 489 // on it immediately. 490 status = uvm_push_end_and_wait(&push); 491 if (status != NV_OK) { 492 UVM_ERR_PRINT("Failed to wait for pushed VA global fault cancel: %s, GPU %s\n", 493 nvstatusToString(status), uvm_gpu_name(gpu)); 494 } 495 496 uvm_tracker_clear(&replayable_faults->replay_tracker); 497 498 return status; 499 } 500 501 static NV_STATUS push_replay_on_gpu(uvm_gpu_t *gpu, 502 uvm_fault_replay_type_t type, 503 uvm_fault_service_batch_context_t *batch_context) 504 { 505 NV_STATUS status; 506 uvm_push_t push; 507 uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; 508 uvm_tracker_t *tracker = NULL; 509 510 if (batch_context) 511 tracker = &batch_context->tracker; 512 513 status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, tracker, &push, 514 "Replaying faults"); 515 if (status != NV_OK) 516 return status; 517 518 gpu->parent->host_hal->replay_faults(&push, type); 519 520 // Do not count REPLAY_TYPE_START_ACK_ALL's toward the replay count. 
521 // REPLAY_TYPE_START_ACK_ALL's are issued for cancels, and the cancel 522 // algorithm checks to make sure that no REPLAY_TYPE_START's have been 523 // issued using batch_context->replays. 524 if (batch_context && type != UVM_FAULT_REPLAY_TYPE_START_ACK_ALL) { 525 uvm_tools_broadcast_replay(gpu, &push, batch_context->batch_id, UVM_FAULT_CLIENT_TYPE_GPC); 526 ++batch_context->num_replays; 527 } 528 529 uvm_push_end(&push); 530 531 // Add this push to the GPU's replay_tracker so cancel can wait on it. 532 status = uvm_tracker_add_push_safe(&replayable_faults->replay_tracker, &push); 533 534 if (uvm_procfs_is_debug_enabled()) { 535 if (type == UVM_FAULT_REPLAY_TYPE_START) 536 ++replayable_faults->stats.num_replays; 537 else 538 ++replayable_faults->stats.num_replays_ack_all; 539 } 540 541 return status; 542 } 543 544 static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get) 545 { 546 uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable; 547 548 UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock)); 549 550 // Write get on the GPU only if it's changed. 551 if (replayable_faults->cached_get == get) 552 return; 553 554 replayable_faults->cached_get = get; 555 556 // Update get pointer on the GPU 557 parent_gpu->fault_buffer_hal->write_get(parent_gpu, get); 558 } 559 560 // In Confidential Computing GSP-RM owns the HW replayable fault buffer. 561 // Flushing the fault buffer implies flushing both the HW buffer (using a RM 562 // API), and the SW buffer accessible by UVM ("shadow" buffer). 563 // 564 // The HW buffer needs to be flushed first. This is because, once that flush 565 // completes, any faults that were present in the HW buffer have been moved to 566 // the shadow buffer, or have been discarded by RM. 567 static NV_STATUS hw_fault_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu, hw_fault_buffer_flush_mode_t flush_mode) 568 { 569 NV_STATUS status; 570 NvBool is_flush_mode_move; 571 572 UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock)); 573 UVM_ASSERT((flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_MOVE) || (flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_DISCARD)); 574 575 if (!g_uvm_global.conf_computing_enabled) 576 return NV_OK; 577 578 is_flush_mode_move = (NvBool) (flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_MOVE); 579 status = nvUvmInterfaceFlushReplayableFaultBuffer(&parent_gpu->fault_buffer_info.rm_info, is_flush_mode_move); 580 581 UVM_ASSERT(status == NV_OK); 582 583 return status; 584 } 585 586 static void fault_buffer_skip_replayable_entry(uvm_parent_gpu_t *parent_gpu, NvU32 index) 587 { 588 UVM_ASSERT(parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, index)); 589 590 // Flushed faults are never decrypted, but the decryption IV associated with 591 // replayable faults still requires manual adjustment so it is kept in sync 592 // with the encryption IV on the GSP-RM's side. 
593 if (g_uvm_global.conf_computing_enabled) 594 uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, 1); 595 596 parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index); 597 } 598 599 static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu, 600 uvm_gpu_buffer_flush_mode_t flush_mode, 601 uvm_fault_replay_type_t fault_replay, 602 uvm_fault_service_batch_context_t *batch_context) 603 { 604 NvU32 get; 605 NvU32 put; 606 uvm_spin_loop_t spin; 607 uvm_parent_gpu_t *parent_gpu = gpu->parent; 608 uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable; 609 NV_STATUS status; 610 611 UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock)); 612 UVM_ASSERT(parent_gpu->replayable_faults_supported); 613 614 // Wait for the prior replay to flush out old fault messages 615 if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) { 616 status = uvm_tracker_wait(&replayable_faults->replay_tracker); 617 if (status != NV_OK) 618 return status; 619 } 620 621 // Read PUT pointer from the GPU if requested 622 if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT || flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) { 623 status = hw_fault_buffer_flush_locked(parent_gpu, HW_FAULT_BUFFER_FLUSH_MODE_DISCARD); 624 if (status != NV_OK) 625 return status; 626 replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu); 627 } 628 629 get = replayable_faults->cached_get; 630 put = replayable_faults->cached_put; 631 632 while (get != put) { 633 // Wait until valid bit is set 634 UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin); 635 636 fault_buffer_skip_replayable_entry(parent_gpu, get); 637 ++get; 638 if (get == replayable_faults->max_faults) 639 get = 0; 640 } 641 642 write_get(gpu->parent, get); 643 644 // Issue fault replay 645 return push_replay_on_gpu(gpu, fault_replay, batch_context); 646 } 647 648 NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu) 649 { 650 NV_STATUS status = NV_OK; 651 652 UVM_ASSERT(gpu->parent->replayable_faults_supported); 653 654 // Disables replayable fault interrupts and fault servicing 655 uvm_parent_gpu_replayable_faults_isr_lock(gpu->parent); 656 657 status = fault_buffer_flush_locked(gpu, 658 UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT, 659 UVM_FAULT_REPLAY_TYPE_START, 660 NULL); 661 662 // This will trigger the top half to start servicing faults again, if the 663 // replay brought any back in 664 uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent); 665 return status; 666 } 667 668 static inline int cmp_fault_instance_ptr(const uvm_fault_buffer_entry_t *a, 669 const uvm_fault_buffer_entry_t *b) 670 { 671 int result = uvm_gpu_phys_addr_cmp(a->instance_ptr, b->instance_ptr); 672 // On Volta+ we need to sort by {instance_ptr + subctx_id} pair since it can 673 // map to a different VA space 674 if (result != 0) 675 return result; 676 return UVM_CMP_DEFAULT(a->fault_source.ve_id, b->fault_source.ve_id); 677 } 678 679 // Compare two VA spaces 680 static inline int cmp_va_space(const uvm_va_space_t *a, const uvm_va_space_t *b) 681 { 682 return UVM_CMP_DEFAULT(a, b); 683 } 684 685 // Compare two virtual addresses 686 static inline int cmp_addr(NvU64 a, NvU64 b) 687 { 688 return UVM_CMP_DEFAULT(a, b); 689 } 690 691 // Compare two fault access types 692 static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_type_t b) 693 { 694 UVM_ASSERT(a >= 0 && a < UVM_FAULT_ACCESS_TYPE_COUNT); 695 UVM_ASSERT(b >= 0 && b < 
UVM_FAULT_ACCESS_TYPE_COUNT); 696 697 // Check that fault access type enum values are ordered by "intrusiveness" 698 BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG <= UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK); 699 BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK <= UVM_FAULT_ACCESS_TYPE_WRITE); 700 BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_WRITE <= UVM_FAULT_ACCESS_TYPE_READ); 701 BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_READ <= UVM_FAULT_ACCESS_TYPE_PREFETCH); 702 703 return b - a; 704 } 705 706 typedef enum 707 { 708 // Fetch a batch of faults from the buffer. Stop at the first entry that is 709 // not ready yet 710 FAULT_FETCH_MODE_BATCH_READY, 711 712 // Fetch all faults in the buffer before PUT. Wait for all faults to become 713 // ready 714 FAULT_FETCH_MODE_ALL, 715 } fault_fetch_mode_t; 716 717 static void fetch_fault_buffer_merge_entry(uvm_fault_buffer_entry_t *current_entry, 718 uvm_fault_buffer_entry_t *last_entry) 719 { 720 UVM_ASSERT(last_entry->num_instances > 0); 721 722 ++last_entry->num_instances; 723 uvm_fault_access_type_mask_set(&last_entry->access_type_mask, current_entry->fault_access_type); 724 725 if (current_entry->fault_access_type > last_entry->fault_access_type) { 726 // If the new entry has a higher access type, it becomes the 727 // fault to be serviced. Add the previous one to the list of instances 728 current_entry->access_type_mask = last_entry->access_type_mask; 729 current_entry->num_instances = last_entry->num_instances; 730 last_entry->filtered = true; 731 732 // We only merge faults from different uTLBs if the new fault has an 733 // access type with the same or lower level of intrusiveness. 734 UVM_ASSERT(current_entry->fault_source.utlb_id == last_entry->fault_source.utlb_id); 735 736 list_replace(&last_entry->merged_instances_list, ¤t_entry->merged_instances_list); 737 list_add(&last_entry->merged_instances_list, ¤t_entry->merged_instances_list); 738 } 739 else { 740 // Add the new entry to the list of instances for reporting purposes 741 current_entry->filtered = true; 742 list_add(¤t_entry->merged_instances_list, &last_entry->merged_instances_list); 743 } 744 } 745 746 static bool fetch_fault_buffer_try_merge_entry(uvm_fault_buffer_entry_t *current_entry, 747 uvm_fault_service_batch_context_t *batch_context, 748 uvm_fault_utlb_info_t *current_tlb, 749 bool is_same_instance_ptr) 750 { 751 uvm_fault_buffer_entry_t *last_tlb_entry = current_tlb->last_fault; 752 uvm_fault_buffer_entry_t *last_global_entry = batch_context->last_fault; 753 754 // Check the last coalesced fault and the coalesced fault that was 755 // originated from this uTLB 756 const bool is_last_tlb_fault = current_tlb->num_pending_faults > 0 && 757 cmp_fault_instance_ptr(current_entry, last_tlb_entry) == 0 && 758 current_entry->fault_address == last_tlb_entry->fault_address; 759 760 // We only merge faults from different uTLBs if the new fault has an 761 // access type with the same or lower level of intrusiveness. This is to 762 // avoid having to update num_pending_faults on both uTLBs and recomputing 763 // last_fault. 
764 const bool is_last_fault = is_same_instance_ptr && 765 current_entry->fault_address == last_global_entry->fault_address && 766 current_entry->fault_access_type <= last_global_entry->fault_access_type; 767 768 if (is_last_tlb_fault) { 769 fetch_fault_buffer_merge_entry(current_entry, last_tlb_entry); 770 if (current_entry->fault_access_type > last_tlb_entry->fault_access_type) 771 current_tlb->last_fault = current_entry; 772 773 return true; 774 } 775 else if (is_last_fault) { 776 fetch_fault_buffer_merge_entry(current_entry, last_global_entry); 777 if (current_entry->fault_access_type > last_global_entry->fault_access_type) 778 batch_context->last_fault = current_entry; 779 780 return true; 781 } 782 783 return false; 784 } 785 786 // Fetch entries from the fault buffer, decode them and store them in the batch 787 // context. We implement the fetch modes described above. 788 // 789 // When possible, we coalesce duplicate entries to minimize the fault handling 790 // overhead. Basically, we merge faults with the same instance pointer and page 791 // virtual address. We keep track of the last fault per uTLB to detect 792 // duplicates due to local reuse and the last fault in the whole batch to 793 // detect reuse across CTAs. 794 // 795 // We will service the first fault entry with the most "intrusive" (atomic > 796 // write > read > prefetch) access type*. That fault entry is called the 797 // "representative". The rest of filtered faults have the "filtered" flag set 798 // and are added to a list in the representative fault entry for reporting 799 // purposes. The representative fault entry also contains a mask with all the 800 // access types that produced a fault on the page. 801 // 802 // *We only merge faults from different uTLBs if the new fault has an access 803 // type with the same or lower level of intrusiveness. 804 // 805 // This optimization cannot be performed during fault cancel on Pascal GPUs 806 // (fetch_mode == FAULT_FETCH_MODE_ALL) since we need accurate tracking of all 807 // the faults in each uTLB in order to guarantee precise fault attribution. 
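//
// Illustrative sketch of the coalescing described above (an example only, not
// additional behavior): suppose a batch contains three faults on the same
// instance_ptr and page virtual address, from the same uTLB, with access
// types READ, WRITE and ATOMIC_STRONG. After fetching:
//   - The ATOMIC_STRONG entry is the representative: its num_instances is 3
//     and its access_type_mask has the READ, WRITE and ATOMIC_STRONG bits set.
//   - The READ and WRITE entries have filtered == true and are linked into
//     the representative's merged_instances_list for reporting purposes.
// Only the representative is serviced; filtered entries are skipped when
// building ordered_fault_cache in preprocess_fault_batch().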
static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
                                            uvm_fault_service_batch_context_t *batch_context,
                                            fault_fetch_mode_t fetch_mode)
{
    NvU32 get;
    NvU32 put;
    NvU32 fault_index;
    NvU32 num_coalesced_faults;
    NvU32 utlb_id;
    uvm_fault_buffer_entry_t *fault_cache;
    uvm_spin_loop_t spin;
    NV_STATUS status = NV_OK;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
    const bool in_pascal_cancel_path = (!gpu->parent->fault_cancel_va_supported && fetch_mode == FAULT_FETCH_MODE_ALL);
    const bool may_filter = uvm_perf_fault_coalesce && !in_pascal_cancel_path;

    UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
    UVM_ASSERT(gpu->parent->replayable_faults_supported);

    fault_cache = batch_context->fault_cache;

    get = replayable_faults->cached_get;

    // Read put pointer from GPU and cache it
    if (get == replayable_faults->cached_put)
        replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);

    put = replayable_faults->cached_put;

    batch_context->is_single_instance_ptr = true;
    batch_context->last_fault = NULL;

    fault_index = 0;
    num_coalesced_faults = 0;

    // Clear uTLB counters
    for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
        batch_context->utlbs[utlb_id].num_pending_faults = 0;
        batch_context->utlbs[utlb_id].has_fatal_faults = false;
    }
    batch_context->max_utlb_id = 0;

    if (get == put)
        goto done;

    // Parse entries until we reach put, or until we run out of space in the
    // cache.
    while ((get != put) &&
           (fetch_mode == FAULT_FETCH_MODE_ALL || fault_index < gpu->parent->fault_buffer_info.max_batch_size)) {
        bool is_same_instance_ptr = true;
        uvm_fault_buffer_entry_t *current_entry = &fault_cache[fault_index];
        uvm_fault_utlb_info_t *current_tlb;

        // We cannot just wait for the last entry (the one pointed to by put)
        // to become valid; we have to wait for each entry individually since
        // they can be written out of order.
        UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
            // We have some entry to work on. Let's do the rest later.
            if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
                goto done;
        }

        // Prevent later accesses from being moved above the read of the valid
        // bit
        smp_mb__after_atomic();

        // Got valid bit set. Let's cache.
        status = gpu->parent->fault_buffer_hal->parse_replayable_entry(gpu->parent, get, current_entry);
        if (status != NV_OK)
            goto done;

        // The GPU aligns the fault addresses to 4k, but all of our tracking is
        // done in PAGE_SIZE chunks, which might be larger.
879 current_entry->fault_address = UVM_PAGE_ALIGN_DOWN(current_entry->fault_address); 880 881 // Make sure that all fields in the entry are properly initialized 882 current_entry->is_fatal = (current_entry->fault_type >= UVM_FAULT_TYPE_FATAL); 883 884 if (current_entry->is_fatal) { 885 // Record the fatal fault event later as we need the va_space locked 886 current_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType; 887 } 888 else { 889 current_entry->fatal_reason = UvmEventFatalReasonInvalid; 890 } 891 892 current_entry->va_space = NULL; 893 current_entry->filtered = false; 894 current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL; 895 896 if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) { 897 UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count); 898 batch_context->max_utlb_id = current_entry->fault_source.utlb_id; 899 } 900 901 current_tlb = &batch_context->utlbs[current_entry->fault_source.utlb_id]; 902 903 if (fault_index > 0) { 904 UVM_ASSERT(batch_context->last_fault); 905 is_same_instance_ptr = cmp_fault_instance_ptr(current_entry, batch_context->last_fault) == 0; 906 907 // Coalesce duplicate faults when possible 908 if (may_filter && !current_entry->is_fatal) { 909 bool merged = fetch_fault_buffer_try_merge_entry(current_entry, 910 batch_context, 911 current_tlb, 912 is_same_instance_ptr); 913 if (merged) 914 goto next_fault; 915 } 916 } 917 918 if (batch_context->is_single_instance_ptr && !is_same_instance_ptr) 919 batch_context->is_single_instance_ptr = false; 920 921 current_entry->num_instances = 1; 922 current_entry->access_type_mask = uvm_fault_access_type_mask_bit(current_entry->fault_access_type); 923 INIT_LIST_HEAD(¤t_entry->merged_instances_list); 924 925 ++current_tlb->num_pending_faults; 926 current_tlb->last_fault = current_entry; 927 batch_context->last_fault = current_entry; 928 929 ++num_coalesced_faults; 930 931 next_fault: 932 ++fault_index; 933 ++get; 934 if (get == replayable_faults->max_faults) 935 get = 0; 936 } 937 938 done: 939 write_get(gpu->parent, get); 940 941 batch_context->num_cached_faults = fault_index; 942 batch_context->num_coalesced_faults = num_coalesced_faults; 943 944 return status; 945 } 946 947 // Sort comparator for pointers to fault buffer entries that sorts by 948 // instance pointer 949 static int cmp_sort_fault_entry_by_instance_ptr(const void *_a, const void *_b) 950 { 951 const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a; 952 const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b; 953 954 return cmp_fault_instance_ptr(*a, *b); 955 } 956 957 // Sort comparator for pointers to fault buffer entries that sorts by va_space, 958 // fault address and fault access type 959 static int cmp_sort_fault_entry_by_va_space_address_access_type(const void *_a, const void *_b) 960 { 961 const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a; 962 const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b; 963 964 int result; 965 966 result = cmp_va_space((*a)->va_space, (*b)->va_space); 967 if (result != 0) 968 return result; 969 970 result = cmp_addr((*a)->fault_address, (*b)->fault_address); 971 if (result != 0) 972 return result; 973 974 return cmp_access_type((*a)->fault_access_type, (*b)->fault_access_type); 975 } 976 977 // Translate all instance pointers to VA spaces. 
Since the buffer is ordered by 978 // instance_ptr, we minimize the number of translations 979 // 980 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer 981 // flush occurred and executed successfully, or the error code if it failed. 982 // NV_OK otherwise. 983 static NV_STATUS translate_instance_ptrs(uvm_gpu_t *gpu, 984 uvm_fault_service_batch_context_t *batch_context) 985 { 986 NvU32 i; 987 NV_STATUS status; 988 989 for (i = 0; i < batch_context->num_coalesced_faults; ++i) { 990 uvm_fault_buffer_entry_t *current_entry; 991 992 current_entry = batch_context->ordered_fault_cache[i]; 993 994 // If this instance pointer matches the previous instance pointer, just 995 // copy over the already-translated va_space and move on. 996 if (i != 0 && cmp_fault_instance_ptr(current_entry, batch_context->ordered_fault_cache[i - 1]) == 0) { 997 current_entry->va_space = batch_context->ordered_fault_cache[i - 1]->va_space; 998 continue; 999 } 1000 1001 status = uvm_parent_gpu_fault_entry_to_va_space(gpu->parent, current_entry, ¤t_entry->va_space); 1002 if (status != NV_OK) { 1003 if (status == NV_ERR_PAGE_TABLE_NOT_AVAIL) { 1004 // The channel is valid but the subcontext is not. This can only 1005 // happen if the subcontext is torn down before its work is 1006 // complete while other subcontexts in the same TSG are still 1007 // executing. This is a violation of the programming model. We 1008 // have limited options since the VA space is gone, meaning we 1009 // can't target the PDB for cancel even if we wanted to. So 1010 // we'll just throw away precise attribution and cancel this 1011 // fault using the SW method, which validates that the intended 1012 // context (TSG) is still running so we don't cancel an innocent 1013 // context. 1014 UVM_ASSERT(!current_entry->va_space); 1015 UVM_ASSERT(gpu->max_subcontexts > 0); 1016 1017 if (gpu->parent->smc.enabled) { 1018 status = push_cancel_on_gpu_targeted(gpu, 1019 current_entry->instance_ptr, 1020 current_entry->fault_source.gpc_id, 1021 current_entry->fault_source.client_id, 1022 &batch_context->tracker); 1023 } 1024 else { 1025 status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker); 1026 } 1027 1028 if (status != NV_OK) 1029 return status; 1030 1031 // Fall through and let the flush restart fault processing 1032 } 1033 else { 1034 UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL); 1035 } 1036 1037 // If the channel is gone then we're looking at a stale fault entry. 1038 // The fault must have been resolved already (serviced or 1039 // cancelled), so we can just flush the fault buffer. 1040 // 1041 // No need to use UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT since 1042 // there was a context preemption for the entries we want to flush, 1043 // meaning PUT must reflect them. 1044 status = fault_buffer_flush_locked(gpu, 1045 UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT, 1046 UVM_FAULT_REPLAY_TYPE_START, 1047 batch_context); 1048 if (status != NV_OK) 1049 return status; 1050 1051 return NV_WARN_MORE_PROCESSING_REQUIRED; 1052 } 1053 else { 1054 UVM_ASSERT(current_entry->va_space); 1055 } 1056 } 1057 1058 return NV_OK; 1059 } 1060 1061 // Fault cache preprocessing for fault coalescing 1062 // 1063 // This function generates an ordered view of the given fault_cache in which 1064 // faults are sorted by VA space, fault address (aligned to 4K) and access type 1065 // "intrusiveness". In order to minimize the number of instance_ptr to VA space 1066 // translations we perform a first sort by instance_ptr. 
1067 // 1068 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer 1069 // flush occurred during instance_ptr translation and executed successfully, or 1070 // the error code if it failed. NV_OK otherwise. 1071 // 1072 // Current scheme: 1073 // 1) sort by instance_ptr 1074 // 2) translate all instance_ptrs to VA spaces 1075 // 3) sort by va_space, fault address (fault_address is page-aligned at this 1076 // point) and access type 1077 static NV_STATUS preprocess_fault_batch(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context) 1078 { 1079 NV_STATUS status; 1080 NvU32 i, j; 1081 uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache; 1082 1083 UVM_ASSERT(batch_context->num_coalesced_faults > 0); 1084 UVM_ASSERT(batch_context->num_cached_faults >= batch_context->num_coalesced_faults); 1085 1086 // Generate an ordered view of the fault cache in ordered_fault_cache. 1087 // We sort the pointers, not the entries in fault_cache 1088 1089 // Initialize pointers before they are sorted. We only sort one instance per 1090 // coalesced fault 1091 for (i = 0, j = 0; i < batch_context->num_cached_faults; ++i) { 1092 if (!batch_context->fault_cache[i].filtered) 1093 ordered_fault_cache[j++] = &batch_context->fault_cache[i]; 1094 } 1095 UVM_ASSERT(j == batch_context->num_coalesced_faults); 1096 1097 // 1) if the fault batch contains more than one, sort by instance_ptr 1098 if (!batch_context->is_single_instance_ptr) { 1099 sort(ordered_fault_cache, 1100 batch_context->num_coalesced_faults, 1101 sizeof(*ordered_fault_cache), 1102 cmp_sort_fault_entry_by_instance_ptr, 1103 NULL); 1104 } 1105 1106 // 2) translate all instance_ptrs to VA spaces 1107 status = translate_instance_ptrs(gpu, batch_context); 1108 if (status != NV_OK) 1109 return status; 1110 1111 // 3) sort by va_space, fault address (GPU already reports 4K-aligned 1112 // address) and access type 1113 sort(ordered_fault_cache, 1114 batch_context->num_coalesced_faults, 1115 sizeof(*ordered_fault_cache), 1116 cmp_sort_fault_entry_by_va_space_address_access_type, 1117 NULL); 1118 1119 return NV_OK; 1120 } 1121 1122 static bool check_fault_entry_duplicate(const uvm_fault_buffer_entry_t *current_entry, 1123 const uvm_fault_buffer_entry_t *previous_entry) 1124 { 1125 bool is_duplicate = false; 1126 1127 if (previous_entry) { 1128 is_duplicate = (current_entry->va_space == previous_entry->va_space) && 1129 (current_entry->fault_address == previous_entry->fault_address); 1130 } 1131 1132 return is_duplicate; 1133 } 1134 1135 static void update_batch_and_notify_fault(uvm_gpu_t *gpu, 1136 uvm_fault_service_batch_context_t *batch_context, 1137 uvm_va_block_t *va_block, 1138 uvm_processor_id_t preferred_location, 1139 uvm_fault_buffer_entry_t *current_entry, 1140 bool is_duplicate) 1141 { 1142 if (is_duplicate) 1143 batch_context->num_duplicate_faults += current_entry->num_instances; 1144 else 1145 batch_context->num_duplicate_faults += current_entry->num_instances - 1; 1146 1147 uvm_perf_event_notify_gpu_fault(¤t_entry->va_space->perf_events, 1148 va_block, 1149 gpu->id, 1150 preferred_location, 1151 current_entry, 1152 batch_context->batch_id, 1153 is_duplicate); 1154 } 1155 1156 static void mark_fault_invalid_prefetch(uvm_fault_service_batch_context_t *batch_context, 1157 uvm_fault_buffer_entry_t *fault_entry) 1158 { 1159 fault_entry->is_invalid_prefetch = true; 1160 1161 // For block faults, the following counter might be updated more than once 1162 // for the same fault if 
    // block_context->num_retries > 0. As a result, this counter might be
    // higher than the actual count. In order for this counter to always be
    // accurate, block_context would need to be passed down the stack from all
    // callers. But since the num_retries > 0 case is uncommon and an
    // imprecise invalid_prefetch counter doesn't affect functionality (other
    // than disabling prefetching if the counter indicates lots of invalid
    // prefetch faults), this is OK.
    batch_context->num_invalid_prefetch_faults += fault_entry->num_instances;
}

static void mark_fault_throttled(uvm_fault_service_batch_context_t *batch_context,
                                 uvm_fault_buffer_entry_t *fault_entry)
{
    fault_entry->is_throttled = true;
    batch_context->has_throttled_faults = true;
}

static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
                             uvm_fault_buffer_entry_t *fault_entry,
                             UvmEventFatalReason fatal_reason,
                             uvm_fault_cancel_va_mode_t cancel_va_mode)
{
    uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[fault_entry->fault_source.utlb_id];

    fault_entry->is_fatal = true;
    fault_entry->fatal_reason = fatal_reason;
    fault_entry->replayable.cancel_va_mode = cancel_va_mode;

    utlb->has_fatal_faults = true;

    if (!batch_context->fatal_va_space) {
        UVM_ASSERT(fault_entry->va_space);
        batch_context->fatal_va_space = fault_entry->va_space;
    }
}

static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
                                        uvm_fault_buffer_entry_t *current_entry,
                                        const uvm_fault_buffer_entry_t *previous_entry)
{
    UVM_ASSERT(previous_entry);
    UVM_ASSERT(check_fault_entry_duplicate(current_entry, previous_entry));

    // Propagate the is_invalid_prefetch flag across all prefetch faults
    // on the page
    if (previous_entry->is_invalid_prefetch)
        mark_fault_invalid_prefetch(batch_context, current_entry);

    // If a page is throttled, all faults on the page must be skipped
    if (previous_entry->is_throttled)
        mark_fault_throttled(batch_context, current_entry);
}

// This function computes the maximum access type that can be serviced for the
// reported fault instances given the logical permissions of the VA range. If
// none of the fault instances can be serviced, UVM_FAULT_ACCESS_TYPE_COUNT is
// returned instead.
//
// If there are faults that cannot be serviced, this function also sets the
// flags required for fault cancellation. Prefetch faults do not need to be
// cancelled since they disappear on replay.
//
// The UVM driver considers two scenarios for logical permissions violation:
// - All access types are invalid. For example, when faulting from a processor
//   that doesn't have access to the preferred location of a range group when
//   it is not migratable. In this case all accesses to the page must be
//   cancelled.
// - Write/atomic accesses are invalid. Basically, when trying to modify a
//   read-only VA range. In this case we restrict fault cancelling to those
//   types of accesses.
//
// Return values:
// - service_access_type: highest access type that can be serviced.
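//
// Illustrative sketch of the second scenario (an example only, not additional
// behavior): an ATOMIC_STRONG fault on a read-only VA range whose
// access_type_mask also contains READ instances. The write/atomic permission
// check fails, but the retried check with UVM_FAULT_ACCESS_TYPE_READ
// succeeds, so READ is returned and the entry is marked fatal with
// cancel_va_mode == UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC: the read
// instances are serviced and only the write/atomic accesses are cancelled.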
1234 static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu, 1235 uvm_fault_service_batch_context_t *batch_context, 1236 uvm_va_block_t *va_block, 1237 uvm_service_block_context_t *service_block_context, 1238 uvm_fault_buffer_entry_t *fault_entry, 1239 bool allow_migration) 1240 { 1241 NV_STATUS perm_status; 1242 UvmEventFatalReason fatal_reason; 1243 uvm_fault_cancel_va_mode_t cancel_va_mode; 1244 uvm_fault_access_type_t ret = UVM_FAULT_ACCESS_TYPE_COUNT; 1245 uvm_va_block_context_t *va_block_context = service_block_context->block_context; 1246 1247 perm_status = uvm_va_block_check_logical_permissions(va_block, 1248 va_block_context, 1249 gpu->id, 1250 uvm_va_block_cpu_page_index(va_block, 1251 fault_entry->fault_address), 1252 fault_entry->fault_access_type, 1253 allow_migration); 1254 if (perm_status == NV_OK) 1255 return fault_entry->fault_access_type; 1256 1257 if (fault_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) { 1258 // Only update the count the first time since logical permissions cannot 1259 // change while we hold the VA space lock 1260 // TODO: Bug 1750144: That might not be true with HMM. 1261 if (service_block_context->num_retries == 0) 1262 mark_fault_invalid_prefetch(batch_context, fault_entry); 1263 1264 return ret; 1265 } 1266 1267 // At this point we know that some fault instances cannot be serviced 1268 fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status); 1269 1270 if (fault_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) { 1271 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC; 1272 1273 // If there are pending read accesses on the same page, we have to 1274 // service them before we can cancel the write/atomic faults. So we 1275 // retry with read fault access type. 1276 if (uvm_fault_access_type_mask_test(fault_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) { 1277 perm_status = uvm_va_block_check_logical_permissions(va_block, 1278 va_block_context, 1279 gpu->id, 1280 uvm_va_block_cpu_page_index(va_block, 1281 fault_entry->fault_address), 1282 UVM_FAULT_ACCESS_TYPE_READ, 1283 allow_migration); 1284 if (perm_status == NV_OK) { 1285 ret = UVM_FAULT_ACCESS_TYPE_READ; 1286 } 1287 else { 1288 // Read accesses didn't succeed, cancel all faults 1289 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL; 1290 fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status); 1291 } 1292 } 1293 } 1294 else { 1295 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL; 1296 } 1297 1298 mark_fault_fatal(batch_context, fault_entry, fatal_reason, cancel_va_mode); 1299 1300 return ret; 1301 } 1302 1303 // We notify the fault event for all faults within the block so that the 1304 // performance heuristics are updated. Then, all required actions for the block 1305 // data are performed by the performance heuristics code. 1306 // 1307 // Fatal faults are flagged as fatal for later cancellation. Servicing is not 1308 // interrupted on fatal faults due to insufficient permissions or invalid 1309 // addresses. 
1310 // 1311 // Return codes: 1312 // - NV_OK if all faults were handled (both fatal and non-fatal) 1313 // - NV_ERR_MORE_PROCESSING_REQUIRED if servicing needs allocation retry 1314 // - NV_ERR_NO_MEMORY if the faults could not be serviced due to OOM 1315 // - Any other value is a UVM-global error 1316 static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu, 1317 uvm_va_block_t *va_block, 1318 uvm_va_block_retry_t *va_block_retry, 1319 uvm_fault_service_batch_context_t *batch_context, 1320 NvU32 first_fault_index, 1321 const bool hmm_migratable, 1322 NvU32 *block_faults) 1323 { 1324 NV_STATUS status = NV_OK; 1325 NvU32 i; 1326 uvm_page_index_t first_page_index; 1327 uvm_page_index_t last_page_index; 1328 NvU32 page_fault_count = 0; 1329 uvm_range_group_range_iter_t iter; 1330 uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; 1331 uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache; 1332 uvm_service_block_context_t *block_context = &replayable_faults->block_service_context; 1333 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 1334 const uvm_va_policy_t *policy; 1335 NvU64 end; 1336 1337 // Check that all uvm_fault_access_type_t values can fit into an NvU8 1338 BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_COUNT > (int)(NvU8)-1); 1339 1340 uvm_assert_mutex_locked(&va_block->lock); 1341 1342 *block_faults = 0; 1343 1344 first_page_index = PAGES_PER_UVM_VA_BLOCK; 1345 last_page_index = 0; 1346 1347 // Initialize fault service block context 1348 uvm_processor_mask_zero(&block_context->resident_processors); 1349 block_context->thrashing_pin_count = 0; 1350 block_context->read_duplicate_count = 0; 1351 1352 uvm_range_group_range_migratability_iter_first(va_space, va_block->start, va_block->end, &iter); 1353 1354 // The first entry is guaranteed to fall within this block 1355 UVM_ASSERT(ordered_fault_cache[first_fault_index]->va_space == va_space); 1356 UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address >= va_block->start); 1357 UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address <= va_block->end); 1358 1359 if (uvm_va_block_is_hmm(va_block)) { 1360 policy = uvm_hmm_find_policy_end(va_block, 1361 block_context->block_context->hmm.vma, 1362 ordered_fault_cache[first_fault_index]->fault_address, 1363 &end); 1364 } 1365 else { 1366 policy = uvm_va_range_get_policy(va_block->va_range); 1367 end = va_block->end; 1368 } 1369 1370 // Scan the sorted array and notify the fault event for all fault entries 1371 // in the block 1372 for (i = first_fault_index; 1373 i < batch_context->num_coalesced_faults && 1374 ordered_fault_cache[i]->va_space == va_space && 1375 ordered_fault_cache[i]->fault_address <= end; 1376 ++i) { 1377 uvm_fault_buffer_entry_t *current_entry = ordered_fault_cache[i]; 1378 const uvm_fault_buffer_entry_t *previous_entry = NULL; 1379 bool read_duplicate; 1380 uvm_processor_id_t new_residency; 1381 uvm_perf_thrashing_hint_t thrashing_hint; 1382 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, current_entry->fault_address); 1383 bool is_duplicate = false; 1384 uvm_fault_access_type_t service_access_type; 1385 NvU32 service_access_type_mask; 1386 1387 UVM_ASSERT(current_entry->fault_access_type == 1388 uvm_fault_access_type_mask_highest(current_entry->access_type_mask)); 1389 1390 // Unserviceable faults were already skipped by the caller. 
There are no 1391 // unserviceable fault types that could be in the same VA block as a 1392 // serviceable fault. 1393 UVM_ASSERT(!current_entry->is_fatal); 1394 current_entry->is_throttled = false; 1395 current_entry->is_invalid_prefetch = false; 1396 1397 if (i > first_fault_index) { 1398 previous_entry = ordered_fault_cache[i - 1]; 1399 is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry); 1400 } 1401 1402 // Only update counters the first time since logical permissions cannot 1403 // change while we hold the VA space lock. 1404 // TODO: Bug 1750144: That might not be true with HMM. 1405 if (block_context->num_retries == 0) { 1406 update_batch_and_notify_fault(gpu, 1407 batch_context, 1408 va_block, 1409 policy->preferred_location, 1410 current_entry, 1411 is_duplicate); 1412 } 1413 1414 // Service the most intrusive fault per page, only. Waive the rest 1415 if (is_duplicate) { 1416 fault_entry_duplicate_flags(batch_context, current_entry, previous_entry); 1417 1418 // The previous fault was non-fatal so the page has been already 1419 // serviced 1420 if (!previous_entry->is_fatal) 1421 continue; 1422 } 1423 1424 // Ensure that the migratability iterator covers the current fault 1425 // address 1426 while (iter.end < current_entry->fault_address) 1427 uvm_range_group_range_migratability_iter_next(va_space, &iter, va_block->end); 1428 1429 UVM_ASSERT(iter.start <= current_entry->fault_address && iter.end >= current_entry->fault_address); 1430 1431 service_access_type = check_fault_access_permissions(gpu, 1432 batch_context, 1433 va_block, 1434 block_context, 1435 current_entry, 1436 iter.migratable); 1437 1438 // Do not exit early due to logical errors such as access permission 1439 // violation. 1440 if (service_access_type == UVM_FAULT_ACCESS_TYPE_COUNT) 1441 continue; 1442 1443 if (service_access_type != current_entry->fault_access_type) { 1444 // Some of the fault instances cannot be serviced due to invalid 1445 // access permissions. Recompute the access type service mask to 1446 // service the rest. 1447 UVM_ASSERT(service_access_type < current_entry->fault_access_type); 1448 service_access_type_mask = uvm_fault_access_type_mask_bit(service_access_type); 1449 } 1450 else { 1451 service_access_type_mask = current_entry->access_type_mask; 1452 } 1453 1454 // If the GPU already has the necessary access permission, the fault 1455 // does not need to be serviced 1456 if (uvm_va_block_page_is_gpu_authorized(va_block, 1457 page_index, 1458 gpu->id, 1459 uvm_fault_access_type_to_prot(service_access_type))) 1460 continue; 1461 1462 thrashing_hint = uvm_perf_thrashing_get_hint(va_block, 1463 block_context->block_context, 1464 current_entry->fault_address, 1465 gpu->id); 1466 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) { 1467 // Throttling is implemented by sleeping in the fault handler on 1468 // the CPU and by continuing to process faults on other pages on 1469 // the GPU 1470 // 1471 // Only update the flag the first time since logical permissions 1472 // cannot change while we hold the VA space lock. 1473 // TODO: Bug 1750144: That might not be true with HMM. 
1474 if (block_context->num_retries == 0) 1475 mark_fault_throttled(batch_context, current_entry); 1476 1477 continue; 1478 } 1479 else if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { 1480 if (block_context->thrashing_pin_count++ == 0) 1481 uvm_page_mask_zero(&block_context->thrashing_pin_mask); 1482 1483 uvm_page_mask_set(&block_context->thrashing_pin_mask, page_index); 1484 } 1485 1486 // Compute new residency and update the masks 1487 new_residency = uvm_va_block_select_residency(va_block, 1488 block_context->block_context, 1489 page_index, 1490 gpu->id, 1491 service_access_type_mask, 1492 policy, 1493 &thrashing_hint, 1494 UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS, 1495 hmm_migratable, 1496 &read_duplicate); 1497 1498 if (!uvm_processor_mask_test_and_set(&block_context->resident_processors, new_residency)) 1499 uvm_page_mask_zero(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 1500 1501 uvm_page_mask_set(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index); 1502 1503 if (read_duplicate) { 1504 if (block_context->read_duplicate_count++ == 0) 1505 uvm_page_mask_zero(&block_context->read_duplicate_mask); 1506 1507 uvm_page_mask_set(&block_context->read_duplicate_mask, page_index); 1508 } 1509 1510 ++page_fault_count; 1511 1512 block_context->access_type[page_index] = service_access_type; 1513 1514 if (page_index < first_page_index) 1515 first_page_index = page_index; 1516 if (page_index > last_page_index) 1517 last_page_index = page_index; 1518 } 1519 1520 // Apply the changes computed in the fault service block context, if there 1521 // are pages to be serviced 1522 if (page_fault_count > 0) { 1523 block_context->region = uvm_va_block_region(first_page_index, last_page_index + 1); 1524 status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, block_context); 1525 } 1526 1527 *block_faults = i - first_fault_index; 1528 1529 ++block_context->num_retries; 1530 1531 if (status == NV_OK && batch_context->fatal_va_space) 1532 status = uvm_va_block_set_cancel(va_block, block_context->block_context, gpu); 1533 1534 return status; 1535 } 1536 1537 // We notify the fault event for all faults within the block so that the 1538 // performance heuristics are updated. The VA block lock is taken for the whole 1539 // fault servicing although it might be temporarily dropped and re-taken if 1540 // memory eviction is required. 1541 // 1542 // See the comments for function service_fault_batch_block_locked for 1543 // implementation details and error codes. 
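// For HMM blocks, the servicing is additionally bracketed by
// uvm_hmm_migrate_begin_wait() and uvm_hmm_migrate_finish(). On success, any
// error from adding the VA block tracker to the batch tracker is returned
// instead.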
1544 static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu, 1545 uvm_va_block_t *va_block, 1546 uvm_fault_service_batch_context_t *batch_context, 1547 NvU32 first_fault_index, 1548 const bool hmm_migratable, 1549 NvU32 *block_faults) 1550 { 1551 NV_STATUS status; 1552 uvm_va_block_retry_t va_block_retry; 1553 NV_STATUS tracker_status; 1554 uvm_service_block_context_t *fault_block_context = &gpu->parent->fault_buffer_info.replayable.block_service_context; 1555 1556 fault_block_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS; 1557 fault_block_context->num_retries = 0; 1558 1559 if (uvm_va_block_is_hmm(va_block)) 1560 uvm_hmm_migrate_begin_wait(va_block); 1561 1562 uvm_mutex_lock(&va_block->lock); 1563 1564 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry, 1565 service_fault_batch_block_locked(gpu, 1566 va_block, 1567 &va_block_retry, 1568 batch_context, 1569 first_fault_index, 1570 hmm_migratable, 1571 block_faults)); 1572 1573 tracker_status = uvm_tracker_add_tracker_safe(&batch_context->tracker, &va_block->tracker); 1574 1575 uvm_mutex_unlock(&va_block->lock); 1576 1577 if (uvm_va_block_is_hmm(va_block)) 1578 uvm_hmm_migrate_finish(va_block); 1579 1580 return status == NV_OK? tracker_status: status; 1581 } 1582 1583 typedef enum 1584 { 1585 // Use this mode when calling from the normal fault servicing path 1586 FAULT_SERVICE_MODE_REGULAR, 1587 1588 // Use this mode when servicing faults from the fault cancelling algorithm. 1589 // In this mode no replays are issued 1590 FAULT_SERVICE_MODE_CANCEL, 1591 } fault_service_mode_t; 1592 1593 static void service_fault_batch_fatal(uvm_gpu_t *gpu, 1594 uvm_fault_service_batch_context_t *batch_context, 1595 NvU32 first_fault_index, 1596 NV_STATUS status, 1597 uvm_fault_cancel_va_mode_t cancel_va_mode, 1598 NvU32 *block_faults) 1599 { 1600 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index]; 1601 const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ? 1602 batch_context->ordered_fault_cache[first_fault_index - 1] : NULL; 1603 bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry); 1604 1605 if (is_duplicate) 1606 fault_entry_duplicate_flags(batch_context, current_entry, previous_entry); 1607 1608 if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) 1609 mark_fault_invalid_prefetch(batch_context, current_entry); 1610 else 1611 mark_fault_fatal(batch_context, current_entry, uvm_tools_status_to_fatal_fault_reason(status), cancel_va_mode); 1612 1613 (*block_faults)++; 1614 } 1615 1616 static void service_fault_batch_fatal_notify(uvm_gpu_t *gpu, 1617 uvm_fault_service_batch_context_t *batch_context, 1618 NvU32 first_fault_index, 1619 NV_STATUS status, 1620 uvm_fault_cancel_va_mode_t cancel_va_mode, 1621 NvU32 *block_faults) 1622 { 1623 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index]; 1624 const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ? 
1625 batch_context->ordered_fault_cache[first_fault_index - 1] : NULL; 1626 bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry); 1627 1628 service_fault_batch_fatal(gpu, batch_context, first_fault_index, status, cancel_va_mode, block_faults); 1629 1630 update_batch_and_notify_fault(gpu, batch_context, NULL, UVM_ID_INVALID, current_entry, is_duplicate); 1631 } 1632 1633 static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_space, 1634 struct vm_area_struct *vma, 1635 NvU64 base, 1636 uvm_fault_service_batch_context_t *batch_context, 1637 NvU32 fault_index_start, 1638 NvU32 fault_index_end, 1639 NvU32 *block_faults) 1640 { 1641 NvU32 i; 1642 NV_STATUS status = NV_OK; 1643 uvm_gpu_t *gpu = gpu_va_space->gpu; 1644 uvm_ats_fault_context_t *ats_context = &batch_context->ats_context; 1645 const uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask; 1646 const uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask; 1647 const uvm_page_mask_t *reads_serviced_mask = &ats_context->reads_serviced_mask; 1648 uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask; 1649 uvm_page_mask_t *accessed_mask = &ats_context->accessed_mask; 1650 1651 UVM_ASSERT(vma); 1652 1653 ats_context->client_type = UVM_FAULT_CLIENT_TYPE_GPC; 1654 1655 uvm_page_mask_or(accessed_mask, write_fault_mask, read_fault_mask); 1656 1657 status = uvm_ats_service_faults(gpu_va_space, vma, base, &batch_context->ats_context); 1658 1659 // Remove prefetched pages from the serviced mask since fault servicing 1660 // failures belonging to prefetch pages need to be ignored. 1661 uvm_page_mask_and(faults_serviced_mask, faults_serviced_mask, accessed_mask); 1662 1663 UVM_ASSERT(uvm_page_mask_subset(faults_serviced_mask, accessed_mask)); 1664 1665 if ((status != NV_OK) || uvm_page_mask_equal(faults_serviced_mask, accessed_mask)) { 1666 (*block_faults) += (fault_index_end - fault_index_start); 1667 return status; 1668 } 1669 1670 // Check faults_serviced_mask and reads_serviced_mask for precise fault 1671 // attribution after calling the ATS servicing routine. The 1672 // errors returned from ATS servicing routine should only be 1673 // global errors such as OOM or ECC. uvm_gpu_service_replayable_faults() 1674 // handles global errors by calling cancel_fault_batch(). Precise 1675 // attribution isn't currently supported in such cases. 1676 // 1677 // Precise fault attribution for global errors can be handled by 1678 // servicing one fault at a time until fault servicing encounters an 1679 // error. 1680 // TODO: Bug 3989244: Precise ATS fault attribution for global errors. 
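    // Each remaining fault that was not serviced is cancelled below: read
    // faults, and write faults whose read accesses also went unserviced (per
    // reads_serviced_mask), cancel all accesses on the page; the rest cancel
    // only write and atomic accesses.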
1681 for (i = fault_index_start; i < fault_index_end; i++) { 1682 uvm_page_index_t page_index; 1683 uvm_fault_cancel_va_mode_t cancel_va_mode; 1684 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i]; 1685 uvm_fault_access_type_t access_type = current_entry->fault_access_type; 1686 1687 page_index = (current_entry->fault_address - base) / PAGE_SIZE; 1688 1689 if (uvm_page_mask_test(faults_serviced_mask, page_index)) { 1690 (*block_faults)++; 1691 continue; 1692 } 1693 1694 if (access_type <= UVM_FAULT_ACCESS_TYPE_READ) { 1695 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL; 1696 } 1697 else { 1698 UVM_ASSERT(access_type >= UVM_FAULT_ACCESS_TYPE_WRITE); 1699 if (uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ) && 1700 !uvm_page_mask_test(reads_serviced_mask, page_index)) 1701 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL; 1702 else 1703 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC; 1704 } 1705 1706 service_fault_batch_fatal(gpu, batch_context, i, NV_ERR_INVALID_ADDRESS, cancel_va_mode, block_faults); 1707 } 1708 1709 return status; 1710 } 1711 1712 static void start_new_sub_batch(NvU64 *sub_batch_base, 1713 NvU64 address, 1714 NvU32 *sub_batch_fault_index, 1715 NvU32 fault_index, 1716 uvm_ats_fault_context_t *ats_context) 1717 { 1718 uvm_page_mask_zero(&ats_context->read_fault_mask); 1719 uvm_page_mask_zero(&ats_context->write_fault_mask); 1720 1721 *sub_batch_fault_index = fault_index; 1722 *sub_batch_base = UVM_VA_BLOCK_ALIGN_DOWN(address); 1723 } 1724 1725 static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space, 1726 struct vm_area_struct *vma, 1727 uvm_fault_service_batch_context_t *batch_context, 1728 NvU32 fault_index, 1729 NvU64 outer, 1730 NvU32 *block_faults) 1731 { 1732 NV_STATUS status = NV_OK; 1733 NvU32 i = fault_index; 1734 NvU32 sub_batch_fault_index; 1735 NvU64 sub_batch_base; 1736 uvm_fault_buffer_entry_t *previous_entry = NULL; 1737 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i]; 1738 uvm_ats_fault_context_t *ats_context = &batch_context->ats_context; 1739 uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask; 1740 uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask; 1741 uvm_gpu_t *gpu = gpu_va_space->gpu; 1742 bool replay_per_va_block = 1743 (gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK); 1744 1745 UVM_ASSERT(vma); 1746 1747 outer = min(outer, (NvU64) vma->vm_end); 1748 1749 start_new_sub_batch(&sub_batch_base, current_entry->fault_address, &sub_batch_fault_index, i, ats_context); 1750 1751 do { 1752 uvm_page_index_t page_index; 1753 NvU64 fault_address = current_entry->fault_address; 1754 uvm_fault_access_type_t access_type = current_entry->fault_access_type; 1755 bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry); 1756 1757 // ATS faults can't be unserviceable, since unserviceable faults require 1758 // GMMU PTEs. 1759 UVM_ASSERT(!current_entry->is_fatal); 1760 1761 i++; 1762 1763 update_batch_and_notify_fault(gpu_va_space->gpu, 1764 batch_context, 1765 NULL, 1766 UVM_ID_INVALID, 1767 current_entry, 1768 is_duplicate); 1769 1770 // End of sub-batch. Service faults gathered so far. 
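        // A sub-batch accumulates the read/write fault masks for a single
        // UVM_VA_BLOCK_SIZE-aligned region of the vma. Once a fault falls
        // outside that region, the accumulated masks are serviced and a new
        // sub-batch is started at the faulting address.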
1771         if (fault_address >= (sub_batch_base + UVM_VA_BLOCK_SIZE)) {
1772             UVM_ASSERT(!uvm_page_mask_empty(read_fault_mask) || !uvm_page_mask_empty(write_fault_mask));
1773 
1774             status = service_fault_batch_ats_sub_vma(gpu_va_space,
1775                                                      vma,
1776                                                      sub_batch_base,
1777                                                      batch_context,
1778                                                      sub_batch_fault_index,
1779                                                      i - 1,
1780                                                      block_faults);
1781             if (status != NV_OK || replay_per_va_block)
1782                 break;
1783 
1784             start_new_sub_batch(&sub_batch_base, fault_address, &sub_batch_fault_index, i - 1, ats_context);
1785         }
1786 
1787         page_index = (fault_address - sub_batch_base) / PAGE_SIZE;
1788 
1789         if ((access_type <= UVM_FAULT_ACCESS_TYPE_READ) ||
1790             uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ))
1791             uvm_page_mask_set(read_fault_mask, page_index);
1792 
1793         if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE)
1794             uvm_page_mask_set(write_fault_mask, page_index);
1795 
1796         previous_entry = current_entry;
1797         current_entry = i < batch_context->num_coalesced_faults ? batch_context->ordered_fault_cache[i] : NULL;
1798 
1799     } while (current_entry &&
1800              (current_entry->fault_address < outer) &&
1801              (previous_entry->va_space == current_entry->va_space));
1802 
1803     // Service the last sub-batch.
1804     if ((status == NV_OK) && (!uvm_page_mask_empty(read_fault_mask) || !uvm_page_mask_empty(write_fault_mask))) {
1805         status = service_fault_batch_ats_sub_vma(gpu_va_space,
1806                                                  vma,
1807                                                  sub_batch_base,
1808                                                  batch_context,
1809                                                  sub_batch_fault_index,
1810                                                  i,
1811                                                  block_faults);
1812     }
1813 
1814     return status;
1815 }
1816 
1817 static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
1818                                          struct mm_struct *mm,
1819                                          uvm_fault_service_batch_context_t *batch_context,
1820                                          NvU32 first_fault_index,
1821                                          NvU64 outer,
1822                                          NvU32 *block_faults)
1823 {
1824     NvU32 i;
1825     NV_STATUS status = NV_OK;
1826 
1827     for (i = first_fault_index; i < batch_context->num_coalesced_faults;) {
1828         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1829         const uvm_fault_buffer_entry_t *previous_entry = i > first_fault_index ?
1830                                                           batch_context->ordered_fault_cache[i - 1] : NULL;
1831         NvU64 fault_address = current_entry->fault_address;
1832         struct vm_area_struct *vma;
1833         NvU32 num_faults_before = (*block_faults);
1834 
1835         if (previous_entry && (previous_entry->va_space != current_entry->va_space))
1836             break;
1837 
1838         if (fault_address >= outer)
1839             break;
1840 
1841         vma = find_vma_intersection(mm, fault_address, fault_address + 1);
1842         if (!vma) {
1843             // No vma was found for the address, so cancel all accesses on the
1844             // page: cancelling only write and atomic accesses would leave the
1845             // pending read faults unserviced, which can deadlock because read
1846             // faults need to be serviced before write faults can be cancelled.
1847             service_fault_batch_fatal_notify(gpu_va_space->gpu,
1848                                              batch_context,
1849                                              i,
1850                                              NV_ERR_INVALID_ADDRESS,
1851                                              UVM_FAULT_CANCEL_VA_MODE_ALL,
1852                                              block_faults);
1853 
1854             // Do not fail due to logical errors.
1855 status = NV_OK; 1856 1857 break; 1858 } 1859 1860 status = service_fault_batch_ats_sub(gpu_va_space, vma, batch_context, i, outer, block_faults); 1861 if (status != NV_OK) 1862 break; 1863 1864 i += ((*block_faults) - num_faults_before); 1865 } 1866 1867 return status; 1868 } 1869 1870 static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space, 1871 uvm_gpu_va_space_t *gpu_va_space, 1872 uvm_fault_service_batch_context_t *batch_context, 1873 NvU32 fault_index, 1874 NvU32 *block_faults, 1875 bool replay_per_va_block, 1876 const bool hmm_migratable) 1877 { 1878 NV_STATUS status; 1879 uvm_va_range_t *va_range = NULL; 1880 uvm_va_range_t *va_range_next = NULL; 1881 uvm_va_block_t *va_block; 1882 uvm_gpu_t *gpu = gpu_va_space->gpu; 1883 uvm_va_block_context_t *va_block_context = 1884 gpu->parent->fault_buffer_info.replayable.block_service_context.block_context; 1885 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[fault_index]; 1886 struct mm_struct *mm = va_block_context->mm; 1887 NvU64 fault_address = current_entry->fault_address; 1888 1889 (*block_faults) = 0; 1890 1891 va_range_next = uvm_va_space_iter_first(va_space, fault_address, ~0ULL); 1892 if (va_range_next && (fault_address >= va_range_next->node.start)) { 1893 UVM_ASSERT(fault_address < va_range_next->node.end); 1894 1895 va_range = va_range_next; 1896 va_range_next = uvm_va_space_iter_next(va_range_next, ~0ULL); 1897 } 1898 1899 if (va_range) 1900 status = uvm_va_block_find_create_in_range(va_space, va_range, fault_address, &va_block); 1901 else if (mm) 1902 status = uvm_hmm_va_block_find_create(va_space, fault_address, &va_block_context->hmm.vma, &va_block); 1903 else 1904 status = NV_ERR_INVALID_ADDRESS; 1905 1906 if (status == NV_OK) { 1907 status = service_fault_batch_block(gpu, va_block, batch_context, fault_index, hmm_migratable, block_faults); 1908 } 1909 else if ((status == NV_ERR_INVALID_ADDRESS) && uvm_ats_can_service_faults(gpu_va_space, mm)) { 1910 NvU64 outer = ~0ULL; 1911 1912 UVM_ASSERT(replay_per_va_block == 1913 (gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK)); 1914 1915 // Limit outer to the minimum of next va_range.start and first 1916 // fault_address' next UVM_GMMU_ATS_GRANULARITY alignment so that it's 1917 // enough to check whether the first fault in this dispatch belongs to a 1918 // GMMU region. 1919 if (va_range_next) { 1920 outer = min(va_range_next->node.start, 1921 UVM_ALIGN_DOWN(fault_address + UVM_GMMU_ATS_GRANULARITY, UVM_GMMU_ATS_GRANULARITY)); 1922 } 1923 1924 // ATS lookups are disabled on all addresses within the same 1925 // UVM_GMMU_ATS_GRANULARITY as existing GMMU mappings (see documentation 1926 // in uvm_mmu.h). User mode is supposed to reserve VAs as appropriate to 1927 // prevent any system memory allocations from falling within the NO_ATS 1928 // range of other GMMU mappings, so this shouldn't happen during normal 1929 // operation. However, since this scenario may lead to infinite fault 1930 // loops, we handle it by canceling the fault. 
1931 if (uvm_ats_check_in_gmmu_region(va_space, fault_address, va_range_next)) { 1932 service_fault_batch_fatal_notify(gpu, 1933 batch_context, 1934 fault_index, 1935 NV_ERR_INVALID_ADDRESS, 1936 UVM_FAULT_CANCEL_VA_MODE_ALL, 1937 block_faults); 1938 1939 // Do not fail due to logical errors 1940 status = NV_OK; 1941 } 1942 else { 1943 status = service_fault_batch_ats(gpu_va_space, mm, batch_context, fault_index, outer, block_faults); 1944 } 1945 } 1946 else { 1947 service_fault_batch_fatal_notify(gpu, 1948 batch_context, 1949 fault_index, 1950 status, 1951 UVM_FAULT_CANCEL_VA_MODE_ALL, 1952 block_faults); 1953 1954 // Do not fail due to logical errors 1955 status = NV_OK; 1956 } 1957 1958 return status; 1959 } 1960 1961 // Called when a fault in the batch has been marked fatal. Flush the buffer 1962 // under the VA and mmap locks to remove any potential stale fatal faults, then 1963 // service all new faults for just that VA space and cancel those which are 1964 // fatal. Faults in other VA spaces are replayed when done and will be processed 1965 // when normal fault servicing resumes. 1966 static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context) 1967 { 1968 NV_STATUS status = NV_OK; 1969 NvU32 i; 1970 uvm_va_space_t *va_space = batch_context->fatal_va_space; 1971 uvm_gpu_va_space_t *gpu_va_space = NULL; 1972 struct mm_struct *mm; 1973 uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; 1974 uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context; 1975 uvm_va_block_context_t *va_block_context = service_context->block_context; 1976 1977 UVM_ASSERT(gpu->parent->replayable_faults_supported); 1978 UVM_ASSERT(va_space); 1979 1980 // Perform the flush and re-fetch while holding the mmap_lock and the 1981 // VA space lock. This avoids stale faults because it prevents any vma 1982 // modifications (mmap, munmap, mprotect) from happening between the time HW 1983 // takes the fault and we cancel it. 1984 mm = uvm_va_space_mm_retain_lock(va_space); 1985 uvm_va_block_context_init(va_block_context, mm); 1986 uvm_va_space_down_read(va_space); 1987 1988 // We saw fatal faults in this VA space before. Flush while holding 1989 // mmap_lock to make sure those faults come back (aren't stale). 1990 // 1991 // We need to wait until all old fault messages have arrived before 1992 // flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT. 1993 status = fault_buffer_flush_locked(gpu, 1994 UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT, 1995 UVM_FAULT_REPLAY_TYPE_START, 1996 batch_context); 1997 if (status != NV_OK) 1998 goto done; 1999 2000 // Wait for the flush's replay to finish to give the legitimate faults a 2001 // chance to show up in the buffer again. 2002 status = uvm_tracker_wait(&replayable_faults->replay_tracker); 2003 if (status != NV_OK) 2004 goto done; 2005 2006 // We expect all replayed faults to have arrived in the buffer so we can re- 2007 // service them. The replay-and-wait sequence above will ensure they're all 2008 // in the HW buffer. When GSP owns the HW buffer, we also have to wait for 2009 // GSP to copy all available faults from the HW buffer into the shadow 2010 // buffer. 2011 status = hw_fault_buffer_flush_locked(gpu->parent, HW_FAULT_BUFFER_FLUSH_MODE_MOVE); 2012 if (status != NV_OK) 2013 goto done; 2014 2015 // If there is no GPU VA space for the GPU, ignore all faults in the VA 2016 // space. 
This can happen if the GPU VA space has been destroyed since we 2017 // unlocked the VA space in service_fault_batch. That means the fatal faults 2018 // are stale, because unregistering the GPU VA space requires preempting the 2019 // context and detaching all channels in that VA space. Restart fault 2020 // servicing from the top. 2021 gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent); 2022 if (!gpu_va_space) 2023 goto done; 2024 2025 // Re-parse the new faults 2026 batch_context->num_invalid_prefetch_faults = 0; 2027 batch_context->num_duplicate_faults = 0; 2028 batch_context->num_replays = 0; 2029 batch_context->fatal_va_space = NULL; 2030 batch_context->has_throttled_faults = false; 2031 2032 status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL); 2033 if (status != NV_OK) 2034 goto done; 2035 2036 // No more faults left. Either the previously-seen fatal entry was stale, or 2037 // RM killed the context underneath us. 2038 if (batch_context->num_cached_faults == 0) 2039 goto done; 2040 2041 ++batch_context->batch_id; 2042 2043 status = preprocess_fault_batch(gpu, batch_context); 2044 if (status != NV_OK) { 2045 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 2046 // Another flush happened due to stale faults or a context-fatal 2047 // error. The previously-seen fatal fault might not exist anymore, 2048 // so restart fault servicing from the top. 2049 status = NV_OK; 2050 } 2051 2052 goto done; 2053 } 2054 2055 // Search for the target VA space 2056 for (i = 0; i < batch_context->num_coalesced_faults; i++) { 2057 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i]; 2058 UVM_ASSERT(current_entry->va_space); 2059 if (current_entry->va_space == va_space) 2060 break; 2061 } 2062 2063 while (i < batch_context->num_coalesced_faults) { 2064 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i]; 2065 2066 if (current_entry->va_space != va_space) 2067 break; 2068 2069 // service_fault_batch_dispatch() doesn't expect unserviceable faults. 2070 // Just cancel them directly. 2071 if (current_entry->is_fatal) { 2072 status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL); 2073 if (status != NV_OK) 2074 break; 2075 2076 ++i; 2077 } 2078 else { 2079 uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate; 2080 NvU32 block_faults; 2081 const bool hmm_migratable = true; 2082 2083 ats_invalidate->tlb_batch_pending = false; 2084 2085 // Service all the faults that we can. We only really need to search 2086 // for fatal faults, but attempting to service all is the easiest 2087 // way to do that. 2088 status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false, hmm_migratable); 2089 if (status != NV_OK) { 2090 // TODO: Bug 3900733: clean up locking in service_fault_batch(). 2091 // We need to drop lock and retry. That means flushing and 2092 // starting over. 2093 if (status == NV_WARN_MORE_PROCESSING_REQUIRED || status == NV_WARN_MISMATCHED_TARGET) 2094 status = NV_OK; 2095 2096 break; 2097 } 2098 2099 // Invalidate TLBs before cancel to ensure that fatal faults don't 2100 // get stuck in HW behind non-fatal faults to the same line. 
2101 status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker); 2102 if (status != NV_OK) 2103 break; 2104 2105 while (block_faults-- > 0) { 2106 current_entry = batch_context->ordered_fault_cache[i]; 2107 if (current_entry->is_fatal) { 2108 status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode); 2109 if (status != NV_OK) 2110 break; 2111 } 2112 2113 ++i; 2114 } 2115 } 2116 } 2117 2118 done: 2119 uvm_va_space_up_read(va_space); 2120 uvm_va_space_mm_release_unlock(va_space, mm); 2121 2122 if (status == NV_OK) { 2123 // There are two reasons to flush the fault buffer here. 2124 // 2125 // 1) Functional. We need to replay both the serviced non-fatal faults 2126 // and the skipped faults in other VA spaces. The former need to be 2127 // restarted and the latter need to be replayed so the normal fault 2128 // service mechanism can fetch and process them. 2129 // 2130 // 2) Performance. After cancelling the fatal faults, a flush removes 2131 // any potential duplicated fault that may have been added while 2132 // processing the faults in this batch. This flush also avoids doing 2133 // unnecessary processing after the fatal faults have been cancelled, 2134 // so all the rest are unlikely to remain after a replay because the 2135 // context is probably in the process of dying. 2136 status = fault_buffer_flush_locked(gpu, 2137 UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT, 2138 UVM_FAULT_REPLAY_TYPE_START, 2139 batch_context); 2140 } 2141 2142 return status; 2143 } 2144 // Scan the ordered view of faults and group them by different va_blocks 2145 // (managed faults) and service faults for each va_block, in batch. 2146 // Service non-managed faults one at a time as they are encountered during the 2147 // scan. 2148 // 2149 // Fatal faults are marked for later processing by the caller. 2150 static NV_STATUS service_fault_batch(uvm_gpu_t *gpu, 2151 fault_service_mode_t service_mode, 2152 uvm_fault_service_batch_context_t *batch_context) 2153 { 2154 NV_STATUS status = NV_OK; 2155 NvU32 i; 2156 uvm_va_space_t *va_space = NULL; 2157 uvm_gpu_va_space_t *gpu_va_space = NULL; 2158 uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate; 2159 struct mm_struct *mm = NULL; 2160 const bool replay_per_va_block = service_mode != FAULT_SERVICE_MODE_CANCEL && 2161 gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK; 2162 uvm_service_block_context_t *service_context = 2163 &gpu->parent->fault_buffer_info.replayable.block_service_context; 2164 uvm_va_block_context_t *va_block_context = service_context->block_context; 2165 bool hmm_migratable = true; 2166 2167 UVM_ASSERT(gpu->parent->replayable_faults_supported); 2168 2169 ats_invalidate->tlb_batch_pending = false; 2170 2171 for (i = 0; i < batch_context->num_coalesced_faults;) { 2172 NvU32 block_faults; 2173 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i]; 2174 uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id]; 2175 2176 UVM_ASSERT(current_entry->va_space); 2177 2178 if (current_entry->va_space != va_space) { 2179 // Fault on a different va_space, drop the lock of the old one... 
2180 if (va_space != NULL) { 2181 // TLB entries are invalidated per GPU VA space 2182 status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker); 2183 if (status != NV_OK) 2184 goto fail; 2185 2186 uvm_va_space_up_read(va_space); 2187 uvm_va_space_mm_release_unlock(va_space, mm); 2188 mm = NULL; 2189 } 2190 2191 va_space = current_entry->va_space; 2192 2193 // ... and take the lock of the new one 2194 2195 // If an mm is registered with the VA space, we have to retain it 2196 // in order to lock it before locking the VA space. It is guaranteed 2197 // to remain valid until we release. If no mm is registered, we 2198 // can only service managed faults, not ATS/HMM faults. 2199 mm = uvm_va_space_mm_retain_lock(va_space); 2200 uvm_va_block_context_init(va_block_context, mm); 2201 2202 uvm_va_space_down_read(va_space); 2203 gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent); 2204 } 2205 2206 // Some faults could be already fatal if they cannot be handled by 2207 // the UVM driver 2208 if (current_entry->is_fatal) { 2209 ++i; 2210 if (!batch_context->fatal_va_space) 2211 batch_context->fatal_va_space = va_space; 2212 2213 utlb->has_fatal_faults = true; 2214 UVM_ASSERT(utlb->num_pending_faults > 0); 2215 continue; 2216 } 2217 2218 if (!gpu_va_space) { 2219 // If there is no GPU VA space for the GPU, ignore the fault. This 2220 // can happen if a GPU VA space is destroyed without explicitly 2221 // freeing all memory ranges and there are stale entries in the 2222 // buffer that got fixed by the servicing in a previous batch. 2223 ++i; 2224 continue; 2225 } 2226 2227 status = service_fault_batch_dispatch(va_space, 2228 gpu_va_space, 2229 batch_context, 2230 i, 2231 &block_faults, 2232 replay_per_va_block, 2233 hmm_migratable); 2234 // TODO: Bug 3900733: clean up locking in service_fault_batch(). 2235 if (status == NV_WARN_MORE_PROCESSING_REQUIRED || status == NV_WARN_MISMATCHED_TARGET) { 2236 if (status == NV_WARN_MISMATCHED_TARGET) 2237 hmm_migratable = false; 2238 uvm_va_space_up_read(va_space); 2239 uvm_va_space_mm_release_unlock(va_space, mm); 2240 mm = NULL; 2241 va_space = NULL; 2242 status = NV_OK; 2243 continue; 2244 } 2245 2246 if (status != NV_OK) 2247 goto fail; 2248 2249 hmm_migratable = true; 2250 i += block_faults; 2251 2252 // Don't issue replays in cancel mode 2253 if (replay_per_va_block && !batch_context->fatal_va_space) { 2254 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2255 if (status != NV_OK) 2256 goto fail; 2257 2258 // Increment the batch id if UVM_PERF_FAULT_REPLAY_POLICY_BLOCK 2259 // is used, as we issue a replay after servicing each VA block 2260 // and we can service a number of VA blocks before returning. 
2261 ++batch_context->batch_id; 2262 } 2263 } 2264 2265 if (va_space != NULL) { 2266 NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker); 2267 if (invalidate_status != NV_OK) 2268 status = invalidate_status; 2269 } 2270 2271 fail: 2272 if (va_space != NULL) { 2273 uvm_va_space_up_read(va_space); 2274 uvm_va_space_mm_release_unlock(va_space, mm); 2275 } 2276 2277 return status; 2278 } 2279 2280 // Tells if the given fault entry is the first one in its uTLB 2281 static bool is_first_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context, NvU32 fault_index) 2282 { 2283 NvU32 i; 2284 NvU32 utlb_id = batch_context->fault_cache[fault_index].fault_source.utlb_id; 2285 2286 for (i = 0; i < fault_index; ++i) { 2287 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 2288 2289 // We have found a prior fault in the same uTLB 2290 if (current_entry->fault_source.utlb_id == utlb_id) 2291 return false; 2292 } 2293 2294 return true; 2295 } 2296 2297 // Compute the number of fatal and non-fatal faults for a page in the given uTLB 2298 static void faults_for_page_in_utlb(uvm_fault_service_batch_context_t *batch_context, 2299 uvm_va_space_t *va_space, 2300 NvU64 addr, 2301 NvU32 utlb_id, 2302 NvU32 *fatal_faults, 2303 NvU32 *non_fatal_faults) 2304 { 2305 NvU32 i; 2306 2307 *fatal_faults = 0; 2308 *non_fatal_faults = 0; 2309 2310 // Fault filtering is not allowed in the TLB-based fault cancel path 2311 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2312 2313 for (i = 0; i < batch_context->num_cached_faults; ++i) { 2314 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 2315 2316 if (current_entry->fault_source.utlb_id == utlb_id && 2317 current_entry->va_space == va_space && current_entry->fault_address == addr) { 2318 // We have found the page 2319 if (current_entry->is_fatal) 2320 ++(*fatal_faults); 2321 else 2322 ++(*non_fatal_faults); 2323 } 2324 } 2325 } 2326 2327 // Function that tells if there are addresses (reminder: they are aligned to 4K) 2328 // with non-fatal faults only 2329 static bool no_fatal_pages_in_utlb(uvm_fault_service_batch_context_t *batch_context, 2330 NvU32 start_index, 2331 NvU32 utlb_id) 2332 { 2333 NvU32 i; 2334 2335 // Fault filtering is not allowed in the TLB-based fault cancel path 2336 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2337 2338 for (i = start_index; i < batch_context->num_cached_faults; ++i) { 2339 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 2340 2341 if (current_entry->fault_source.utlb_id == utlb_id) { 2342 // We have found a fault for the uTLB 2343 NvU32 fatal_faults; 2344 NvU32 non_fatal_faults; 2345 2346 faults_for_page_in_utlb(batch_context, 2347 current_entry->va_space, 2348 current_entry->fault_address, 2349 utlb_id, 2350 &fatal_faults, 2351 &non_fatal_faults); 2352 2353 if (non_fatal_faults > 0 && fatal_faults == 0) 2354 return true; 2355 } 2356 } 2357 2358 return false; 2359 } 2360 2361 static void record_fatal_fault_helper(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *entry, UvmEventFatalReason reason) 2362 { 2363 uvm_va_space_t *va_space; 2364 2365 va_space = entry->va_space; 2366 UVM_ASSERT(va_space); 2367 uvm_va_space_down_read(va_space); 2368 // Record fatal fault event 2369 uvm_tools_record_gpu_fatal_fault(gpu->id, va_space, entry, reason); 2370 uvm_va_space_up_read(va_space); 2371 } 2372 2373 // This function tries to find and issue a 
cancel for each uTLB that meets 2374 // the requirements to guarantee precise fault attribution: 2375 // - No new faults can arrive on the uTLB (uTLB is in lockdown) 2376 // - The first fault in the buffer for a specific uTLB is fatal 2377 // - There are no other addresses in the uTLB with non-fatal faults only 2378 // 2379 // This function and the related helpers iterate over faults as read from HW, 2380 // not through the ordered fault view 2381 // 2382 // TODO: Bug 1766754 2383 // This is very costly, although not critical for performance since we are 2384 // cancelling. 2385 // - Build a list with all the faults within a uTLB 2386 // - Sort by uTLB id 2387 static NV_STATUS try_to_cancel_utlbs(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context) 2388 { 2389 NvU32 i; 2390 2391 // Fault filtering is not allowed in the TLB-based fault cancel path 2392 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2393 2394 for (i = 0; i < batch_context->num_cached_faults; ++i) { 2395 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 2396 uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id]; 2397 NvU32 gpc_id = current_entry->fault_source.gpc_id; 2398 NvU32 utlb_id = current_entry->fault_source.utlb_id; 2399 NvU32 client_id = current_entry->fault_source.client_id; 2400 2401 // Only fatal faults are considered 2402 if (!current_entry->is_fatal) 2403 continue; 2404 2405 // Only consider uTLBs in lock-down 2406 if (!utlb->in_lockdown) 2407 continue; 2408 2409 // Issue a single cancel per uTLB 2410 if (utlb->cancelled) 2411 continue; 2412 2413 if (is_first_fault_in_utlb(batch_context, i) && 2414 !no_fatal_pages_in_utlb(batch_context, i + 1, utlb_id)) { 2415 NV_STATUS status; 2416 2417 record_fatal_fault_helper(gpu, current_entry, current_entry->fatal_reason); 2418 2419 status = push_cancel_on_gpu_targeted(gpu, 2420 current_entry->instance_ptr, 2421 gpc_id, 2422 client_id, 2423 &batch_context->tracker); 2424 if (status != NV_OK) 2425 return status; 2426 2427 utlb->cancelled = true; 2428 } 2429 } 2430 2431 return NV_OK; 2432 } 2433 2434 static NvU32 find_fatal_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context, 2435 NvU32 utlb_id) 2436 { 2437 NvU32 i; 2438 2439 // Fault filtering is not allowed in the TLB-based fault cancel path 2440 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2441 2442 for (i = 0; i < batch_context->num_cached_faults; ++i) { 2443 if (batch_context->fault_cache[i].is_fatal && 2444 batch_context->fault_cache[i].fault_source.utlb_id == utlb_id) 2445 return i; 2446 } 2447 2448 return i; 2449 } 2450 2451 static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_context, 2452 uvm_fault_buffer_entry_t *fault) 2453 { 2454 NvU32 i; 2455 2456 // Fault filtering is not allowed in the TLB-based fault cancel path 2457 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2458 2459 for (i = 0; i < batch_context->num_cached_faults; ++i) { 2460 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 2461 if (cmp_fault_instance_ptr(current_entry, fault) == 0 && 2462 current_entry->fault_address == fault->fault_address && 2463 current_entry->fault_access_type == fault->fault_access_type && 2464 current_entry->fault_source.utlb_id == fault->fault_source.utlb_id) { 2465 return true; 2466 } 2467 } 2468 2469 return false; 2470 } 2471 2472 // Cancel all faults in the given fault service 
batch context, even those not
2473 // marked as fatal.
2474 static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
2475                                    uvm_fault_service_batch_context_t *batch_context,
2476                                    UvmEventFatalReason reason)
2477 {
2478     NV_STATUS status = NV_OK;
2479     NV_STATUS fault_status;
2480     NvU32 i = 0;
2481 
2482     UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
2483     UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
2484 
2485     while (i < batch_context->num_coalesced_faults && status == NV_OK) {
2486         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
2487         uvm_va_space_t *va_space = current_entry->va_space;
2488         bool skip_va_space;
2489 
2490         UVM_ASSERT(va_space);
2491 
2492         uvm_va_space_down_read(va_space);
2493 
2494         // If there is no GPU VA space for the GPU, ignore all faults in
2495         // that VA space. This can happen if the GPU VA space has been
2496         // destroyed since we unlocked the VA space in service_fault_batch.
2497         // Ignoring the fault avoids targeting a PDB that might have been
2498         // reused by another process.
2499         skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
2500 
2501         for (;
2502              i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
2503              current_entry = batch_context->ordered_fault_cache[++i]) {
2504             uvm_fault_cancel_va_mode_t cancel_va_mode;
2505 
2506             if (skip_va_space)
2507                 continue;
2508 
2509             if (current_entry->is_fatal) {
2510                 UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
2511                 cancel_va_mode = current_entry->replayable.cancel_va_mode;
2512             }
2513             else {
2514                 current_entry->fatal_reason = reason;
2515                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
2516             }
2517 
2518             status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
2519             if (status != NV_OK)
2520                 break;
2521         }
2522 
2523         uvm_va_space_up_read(va_space);
2524     }
2525 
2526     // Because each cancel itself triggers a replay, there may be a large number
2527     // of new duplicated faults in the buffer after cancelling all the known
2528     // ones. Flushing the buffer discards them to avoid unnecessary processing.
2529     fault_status = fault_buffer_flush_locked(gpu,
2530                                              UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
2531                                              UVM_FAULT_REPLAY_TYPE_START,
2532                                              batch_context);
2533 
2534     // We report the first encountered error.
2535     if (status == NV_OK)
2536         status = fault_status;
2537 
2538     return status;
2539 }
2540 
2541 // Function called when the system has found a global error and needs to
2542 // trigger RC in RM.
2543 static void cancel_fault_batch_tlb(uvm_gpu_t *gpu,
2544                                    uvm_fault_service_batch_context_t *batch_context,
2545                                    UvmEventFatalReason reason)
2546 {
2547     NvU32 i;
2548 
2549     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
2550         NV_STATUS status = NV_OK;
2551         uvm_fault_buffer_entry_t *current_entry;
2552         uvm_fault_buffer_entry_t *coalesced_entry;
2553 
2554         current_entry = batch_context->ordered_fault_cache[i];
2555 
2556         // The list iteration below skips the entry used as 'head'.
2557         // Report the 'head' entry explicitly.
2558         uvm_va_space_down_read(current_entry->va_space);
2559         uvm_tools_record_gpu_fatal_fault(gpu->id, current_entry->va_space, current_entry, reason);
2560 
2561         list_for_each_entry(coalesced_entry, &current_entry->merged_instances_list, merged_instances_list)
2562             uvm_tools_record_gpu_fatal_fault(gpu->id, current_entry->va_space, coalesced_entry, reason);
2563         uvm_va_space_up_read(current_entry->va_space);
2564 
2565         // We need to cancel each instance pointer to correctly handle faults from multiple contexts.
2566         status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
2567         if (status != NV_OK)
2568             break;
2569     }
2570 }
2571 
2572 static void cancel_fault_batch(uvm_gpu_t *gpu,
2573                                uvm_fault_service_batch_context_t *batch_context,
2574                                UvmEventFatalReason reason)
2575 {
2576     // Return code is ignored since we're on a global error path and wouldn't be
2577     // able to recover anyway.
2578     if (gpu->parent->fault_cancel_va_supported)
2579         cancel_faults_all(gpu, batch_context, reason);
2580     else
2581         cancel_fault_batch_tlb(gpu, batch_context, reason);
2582 }
2583 
2584 
2585 // Current fault cancel algorithm
2586 //
2587 // 1- Disable prefetching to keep new requests from coming in and flooding the
2588 //    buffer.
2589 // LOOP
2590 // 2- Record one fatal fault per uTLB to check if it shows up after the replay
2591 // 3- Flush fault buffer (REPLAY_TYPE_START_ACK_ALL to prevent new faults from
2592 //    coming to TLBs with pending faults)
2593 // 4- Wait for replay to finish
2594 // 5- Fetch all faults from buffer
2595 // 6- Check what uTLBs are in lockdown mode and can be cancelled
2596 // 7- Preprocess faults (order per va_space, fault address, access type)
2597 // 8- Service all non-fatal faults and mark all non-serviceable faults as fatal
2598 //    8.1- If no fatal faults are found, we are done
2599 // 9- Search for a uTLB which can be targeted for cancel, as described in
2600 //    try_to_cancel_utlbs. If found, cancel it.
2601 // END LOOP
2602 // 10- Re-enable prefetching
2603 //
2604 // NOTE: prefetch faults MUST NOT trigger fault cancel. We make sure that no
2605 // prefetch faults are left in the buffer by disabling prefetching and
2606 // flushing the fault buffer afterwards (prefetch faults are not replayed and,
2607 // therefore, will not show up again)
2608 static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2609 {
2610     NV_STATUS status;
2611     NV_STATUS tracker_status;
2612     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
2613     bool first = true;
2614 
2615     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2616 
2617     // 1) Disable prefetching to keep new requests from coming in and flooding
2618     // the buffer
2619     if (gpu->parent->fault_buffer_info.prefetch_faults_enabled)
2620         gpu->parent->arch_hal->disable_prefetch_faults(gpu->parent);
2621 
2622     while (1) {
2623         NvU32 utlb_id;
2624 
2625         // 2) Record one fatal fault per uTLB to check if it shows up after
2626         // the replay. This is used to handle the case in which the uTLB is
2627         // being cancelled from behind our backs by RM. See the comment in
2628         // step 6.
2629         for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
2630             uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id];
2631 
2632             if (!first && utlb->has_fatal_faults) {
2633                 NvU32 idx = find_fatal_fault_in_utlb(batch_context, utlb_id);
2634                 UVM_ASSERT(idx < batch_context->num_cached_faults);
2635 
2636                 utlb->prev_fatal_fault = batch_context->fault_cache[idx];
2637             }
2638             else {
2639                 utlb->prev_fatal_fault.fault_address = (NvU64)-1;
2640             }
2641         }
2642         first = false;
2643 
2644         // 3) Flush fault buffer. After this call, all faults from any of the
2645         // faulting uTLBs are before PUT. New faults from other uTLBs can keep
2646         // arriving.
Therefore, in each iteration we just try to cancel faults 2647 // from uTLBs that contained fatal faults in the previous iterations 2648 // and will cause the TLB to stop generating new page faults after the 2649 // following replay with type UVM_FAULT_REPLAY_TYPE_START_ACK_ALL. 2650 // 2651 // No need to use UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT since we 2652 // don't care too much about old faults, just new faults from uTLBs 2653 // which faulted before the replay. 2654 status = fault_buffer_flush_locked(gpu, 2655 UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT, 2656 UVM_FAULT_REPLAY_TYPE_START_ACK_ALL, 2657 batch_context); 2658 if (status != NV_OK) 2659 break; 2660 2661 // 4) Wait for replay to finish 2662 status = uvm_tracker_wait(&replayable_faults->replay_tracker); 2663 if (status != NV_OK) 2664 break; 2665 2666 batch_context->num_invalid_prefetch_faults = 0; 2667 batch_context->num_replays = 0; 2668 batch_context->fatal_va_space = NULL; 2669 batch_context->has_throttled_faults = false; 2670 2671 // 5) Fetch all faults from buffer 2672 status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL); 2673 if (status != NV_OK) 2674 break; 2675 2676 ++batch_context->batch_id; 2677 2678 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2679 2680 // No more faults left, we are done 2681 if (batch_context->num_cached_faults == 0) 2682 break; 2683 2684 // 6) Check what uTLBs are in lockdown mode and can be cancelled 2685 for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) { 2686 uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id]; 2687 2688 utlb->in_lockdown = false; 2689 utlb->cancelled = false; 2690 2691 if (utlb->prev_fatal_fault.fault_address != (NvU64)-1) { 2692 // If a previously-reported fault shows up again we can "safely" 2693 // assume that the uTLB that contains it is in lockdown mode 2694 // and no new translations will show up before cancel. 2695 // A fatal fault could only be removed behind our backs by RM 2696 // issuing a cancel, which only happens when RM is resetting the 2697 // engine. That means the instance pointer can't generate any 2698 // new faults, so we won't have an ABA problem where a new 2699 // fault arrives with the same state. 
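                // A recorded fault is considered to be present again only if
                // its instance pointer, fault address, access type and uTLB id
                // all match (see is_fatal_fault_in_buffer()).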
2700 if (is_fatal_fault_in_buffer(batch_context, &utlb->prev_fatal_fault)) 2701 utlb->in_lockdown = true; 2702 } 2703 } 2704 2705 // 7) Preprocess faults 2706 status = preprocess_fault_batch(gpu, batch_context); 2707 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2708 continue; 2709 else if (status != NV_OK) 2710 break; 2711 2712 // 8) Service all non-fatal faults and mark all non-serviceable faults 2713 // as fatal 2714 status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context); 2715 UVM_ASSERT(batch_context->num_replays == 0); 2716 if (status == NV_ERR_NO_MEMORY) 2717 continue; 2718 else if (status != NV_OK) 2719 break; 2720 2721 // No more fatal faults left, we are done 2722 if (!batch_context->fatal_va_space) 2723 break; 2724 2725 // 9) Search for uTLBs that contain fatal faults and meet the 2726 // requirements to be cancelled 2727 try_to_cancel_utlbs(gpu, batch_context); 2728 } 2729 2730 // 10) Re-enable prefetching 2731 if (gpu->parent->fault_buffer_info.prefetch_faults_enabled) 2732 gpu->parent->arch_hal->enable_prefetch_faults(gpu->parent); 2733 2734 if (status == NV_OK) 2735 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2736 2737 tracker_status = uvm_tracker_wait(&batch_context->tracker); 2738 2739 return status == NV_OK? tracker_status: status; 2740 } 2741 2742 static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context) 2743 { 2744 UVM_ASSERT(batch_context->fatal_va_space); 2745 if (gpu->parent->fault_cancel_va_supported) 2746 return service_fault_batch_for_cancel(gpu, batch_context); 2747 2748 return cancel_faults_precise_tlb(gpu, batch_context); 2749 } 2750 2751 static void enable_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu, uvm_fault_service_batch_context_t *batch_context) 2752 { 2753 if (!parent_gpu->prefetch_fault_supported) 2754 return; 2755 2756 // If more than 66% of faults are invalid prefetch accesses, disable 2757 // prefetch faults for a while. 2758 // num_invalid_prefetch_faults may be higher than the actual count. See the 2759 // comment in mark_fault_invalid_prefetch(..). 2760 // Some tests rely on this logic (and ratio) to correctly disable prefetch 2761 // fault reporting. If the logic changes, the tests will have to be changed. 
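    // For example, with a max_batch_size of 256 the check below disables
    // prefetch faults once a batch contains more than 170 invalid prefetch
    // faults (171 * 3 > 512), i.e. roughly two thirds of the batch.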
2762 if (parent_gpu->fault_buffer_info.prefetch_faults_enabled && 2763 uvm_perf_reenable_prefetch_faults_lapse_msec > 0 && 2764 ((batch_context->num_invalid_prefetch_faults * 3 > parent_gpu->fault_buffer_info.max_batch_size * 2) || 2765 (uvm_enable_builtin_tests && 2766 parent_gpu->rm_info.isSimulated && 2767 batch_context->num_invalid_prefetch_faults > 5))) { 2768 uvm_parent_gpu_disable_prefetch_faults(parent_gpu); 2769 } 2770 else if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) { 2771 NvU64 lapse = NV_GETTIME() - parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp; 2772 2773 // Reenable prefetch faults after some time 2774 if (lapse > ((NvU64)uvm_perf_reenable_prefetch_faults_lapse_msec * (1000 * 1000))) 2775 uvm_parent_gpu_enable_prefetch_faults(parent_gpu); 2776 } 2777 } 2778 2779 void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu) 2780 { 2781 NvU32 num_replays = 0; 2782 NvU32 num_batches = 0; 2783 NvU32 num_throttled = 0; 2784 NV_STATUS status = NV_OK; 2785 uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; 2786 uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context; 2787 2788 UVM_ASSERT(gpu->parent->replayable_faults_supported); 2789 2790 uvm_tracker_init(&batch_context->tracker); 2791 2792 // Process all faults in the buffer 2793 while (1) { 2794 if (num_throttled >= uvm_perf_fault_max_throttle_per_service || 2795 num_batches >= uvm_perf_fault_max_batches_per_service) { 2796 break; 2797 } 2798 2799 batch_context->num_invalid_prefetch_faults = 0; 2800 batch_context->num_duplicate_faults = 0; 2801 batch_context->num_replays = 0; 2802 batch_context->fatal_va_space = NULL; 2803 batch_context->has_throttled_faults = false; 2804 2805 status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY); 2806 if (status != NV_OK) 2807 break; 2808 2809 if (batch_context->num_cached_faults == 0) 2810 break; 2811 2812 ++batch_context->batch_id; 2813 2814 status = preprocess_fault_batch(gpu, batch_context); 2815 2816 num_replays += batch_context->num_replays; 2817 2818 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2819 continue; 2820 else if (status != NV_OK) 2821 break; 2822 2823 status = service_fault_batch(gpu, FAULT_SERVICE_MODE_REGULAR, batch_context); 2824 2825 // We may have issued replays even if status != NV_OK if 2826 // UVM_PERF_FAULT_REPLAY_POLICY_BLOCK is being used or the fault buffer 2827 // was flushed 2828 num_replays += batch_context->num_replays; 2829 2830 enable_disable_prefetch_faults(gpu->parent, batch_context); 2831 2832 if (status != NV_OK) { 2833 // Unconditionally cancel all faults to trigger RC. This will not 2834 // provide precise attribution, but this case handles global 2835 // errors such as OOM or ECC where it's not reasonable to 2836 // guarantee precise attribution. We ignore the return value of 2837 // the cancel operation since this path is already returning an 2838 // error code. 
2839 cancel_fault_batch(gpu, batch_context, uvm_tools_status_to_fatal_fault_reason(status)); 2840 break; 2841 } 2842 2843 if (batch_context->fatal_va_space) { 2844 status = uvm_tracker_wait(&batch_context->tracker); 2845 if (status == NV_OK) { 2846 status = cancel_faults_precise(gpu, batch_context); 2847 if (status == NV_OK) { 2848 // Cancel handling should've issued at least one replay 2849 UVM_ASSERT(batch_context->num_replays > 0); 2850 ++num_batches; 2851 continue; 2852 } 2853 } 2854 2855 break; 2856 } 2857 2858 if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH) { 2859 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2860 if (status != NV_OK) 2861 break; 2862 ++num_replays; 2863 } 2864 else if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH) { 2865 uvm_gpu_buffer_flush_mode_t flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT; 2866 2867 if (batch_context->num_duplicate_faults * 100 > 2868 batch_context->num_cached_faults * replayable_faults->replay_update_put_ratio) { 2869 flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT; 2870 } 2871 2872 status = fault_buffer_flush_locked(gpu, flush_mode, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2873 if (status != NV_OK) 2874 break; 2875 ++num_replays; 2876 status = uvm_tracker_wait(&replayable_faults->replay_tracker); 2877 if (status != NV_OK) 2878 break; 2879 } 2880 2881 if (batch_context->has_throttled_faults) 2882 ++num_throttled; 2883 2884 ++num_batches; 2885 } 2886 2887 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2888 status = NV_OK; 2889 2890 // Make sure that we issue at least one replay if no replay has been 2891 // issued yet to avoid dropping faults that do not show up in the buffer 2892 if ((status == NV_OK && replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_ONCE) || 2893 num_replays == 0) 2894 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2895 2896 uvm_tracker_deinit(&batch_context->tracker); 2897 2898 if (status != NV_OK) 2899 UVM_DBG_PRINT("Error servicing replayable faults on GPU: %s\n", uvm_gpu_name(gpu)); 2900 } 2901 2902 void uvm_parent_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu) 2903 { 2904 UVM_ASSERT(parent_gpu->isr.replayable_faults.handling); 2905 UVM_ASSERT(parent_gpu->prefetch_fault_supported); 2906 2907 if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) { 2908 parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu); 2909 parent_gpu->fault_buffer_info.prefetch_faults_enabled = true; 2910 } 2911 } 2912 2913 void uvm_parent_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu) 2914 { 2915 UVM_ASSERT(parent_gpu->isr.replayable_faults.handling); 2916 UVM_ASSERT(parent_gpu->prefetch_fault_supported); 2917 2918 if (parent_gpu->fault_buffer_info.prefetch_faults_enabled) { 2919 parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu); 2920 parent_gpu->fault_buffer_info.prefetch_faults_enabled = false; 2921 parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp = NV_GETTIME(); 2922 } 2923 } 2924 2925 const char *uvm_perf_fault_replay_policy_string(uvm_perf_fault_replay_policy_t replay_policy) 2926 { 2927 BUILD_BUG_ON(UVM_PERF_FAULT_REPLAY_POLICY_MAX != 4); 2928 2929 switch (replay_policy) { 2930 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BLOCK); 2931 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH); 2932 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH); 2933 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_ONCE); 2934 
UVM_ENUM_STRING_DEFAULT();
2935     }
2936 }
2937 
2938 NV_STATUS uvm_test_get_prefetch_faults_reenable_lapse(UVM_TEST_GET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params,
2939                                                       struct file *filp)
2940 {
2941     params->reenable_lapse = uvm_perf_reenable_prefetch_faults_lapse_msec;
2942 
2943     return NV_OK;
2944 }
2945 
2946 NV_STATUS uvm_test_set_prefetch_faults_reenable_lapse(UVM_TEST_SET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params,
2947                                                       struct file *filp)
2948 {
2949     uvm_perf_reenable_prefetch_faults_lapse_msec = params->reenable_lapse;
2950 
2951     return NV_OK;
2952 }
2953 
2954 NV_STATUS uvm_test_drain_replayable_faults(UVM_TEST_DRAIN_REPLAYABLE_FAULTS_PARAMS *params, struct file *filp)
2955 {
2956     uvm_gpu_t *gpu;
2957     NV_STATUS status = NV_OK;
2958     uvm_spin_loop_t spin;
2959     bool pending = true;
2960     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2961 
2962     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
2963     if (!gpu)
2964         return NV_ERR_INVALID_DEVICE;
2965 
2966     uvm_spin_loop_init(&spin);
2967 
2968     do {
2969         uvm_parent_gpu_replayable_faults_isr_lock(gpu->parent);
2970         pending = uvm_parent_gpu_replayable_faults_pending(gpu->parent);
2971         uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
2972 
2973         if (!pending)
2974             break;
2975 
2976         if (fatal_signal_pending(current)) {
2977             status = NV_ERR_SIGNAL_PENDING;
2978             break;
2979         }
2980 
2981         UVM_SPIN_LOOP(&spin);
2982     } while (uvm_spin_loop_elapsed(&spin) < params->timeout_ns);
2983 
2984     if (pending && status == NV_OK)
2985         status = NV_ERR_TIMEOUT;
2986 
2987     uvm_gpu_release(gpu);
2988 
2989     return status;
2990 }
2991 