/*******************************************************************************
    Copyright (c) 2015-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "linux/sort.h"
#include "nv_uvm_interface.h"
#include "uvm_linux.h"
#include "uvm_global.h"
#include "uvm_gpu_replayable_faults.h"
#include "uvm_hal.h"
#include "uvm_kvmalloc.h"
#include "uvm_tools.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include "uvm_procfs.h"
#include "uvm_perf_thrashing.h"
#include "uvm_gpu_non_replayable_faults.h"
#include "uvm_ats_faults.h"
#include "uvm_test.h"

// The documentation at the beginning of uvm_gpu_non_replayable_faults.c
// provides some background for understanding replayable faults, non-replayable
// faults, and how UVM services each fault type.

#define UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT 1000

// Lapse of time in milliseconds after which prefetch faults can be re-enabled.
// 0 means they are never disabled.
static unsigned uvm_perf_reenable_prefetch_faults_lapse_msec = UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT;
module_param(uvm_perf_reenable_prefetch_faults_lapse_msec, uint, S_IRUGO);

#define UVM_PERF_FAULT_BATCH_COUNT_MIN 1
#define UVM_PERF_FAULT_BATCH_COUNT_DEFAULT 256

// Number of entries that are fetched from the GPU fault buffer and serviced in
// batch.
static unsigned uvm_perf_fault_batch_count = UVM_PERF_FAULT_BATCH_COUNT_DEFAULT;
module_param(uvm_perf_fault_batch_count, uint, S_IRUGO);

#define UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH

// Policy that determines when to issue fault replays.
static uvm_perf_fault_replay_policy_t uvm_perf_fault_replay_policy = UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;
module_param(uvm_perf_fault_replay_policy, uint, S_IRUGO);

#define UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT 50

// Reading fault buffer GET/PUT pointers from the CPU is expensive. However,
// updating PUT before flushing the buffer helps minimize the number of
// duplicates in the buffer, as it discards faults that were not processed
// because of the batch size limit or because they arrived during servicing.
// If PUT is not updated, the replay operation will make them show up again
// in the buffer as duplicates.
//
// We keep track of the number of duplicates in each batch, and use
// UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT for the subsequent fault buffer flush
// if the percentage of duplicate faults in the batch is greater than the
// ratio defined in the following module parameter. Otherwise,
// UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT is used.
static unsigned uvm_perf_fault_replay_update_put_ratio = UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT;
module_param(uvm_perf_fault_replay_update_put_ratio, uint, S_IRUGO);
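
// Illustration of the ratio decision above (the numbers are made up, only the
// defaults are real): with the default ratio of 50 and the default batch size
// of 256, a batch in which 130 faults (~51%) turned out to be duplicates would
// flush with UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT, paying one extra PUT read
// to drop the unprocessed duplicates, while a batch with, say, 60 duplicates
// (~23%) would use the cheaper UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT instead.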

#define UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT 20

#define UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT 5

// Maximum number of batches to be processed per execution of the bottom-half.
static unsigned uvm_perf_fault_max_batches_per_service = UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT;
module_param(uvm_perf_fault_max_batches_per_service, uint, S_IRUGO);

// Maximum number of batches with thrashing pages to be processed per
// execution of the bottom-half.
static unsigned uvm_perf_fault_max_throttle_per_service = UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT;
module_param(uvm_perf_fault_max_throttle_per_service, uint, S_IRUGO);

// Whether to coalesce duplicate fault entries at fetch time. Non-zero enables
// coalescing. See fetch_fault_buffer_entries.
static unsigned uvm_perf_fault_coalesce = 1;
module_param(uvm_perf_fault_coalesce, uint, S_IRUGO);

// This function is used for both the initial fault buffer initialization and
// the power management resume path.
static void fault_buffer_reinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;

    // Read the current get/put pointers, as this might not be the first time
    // we take control of the fault buffer since the GPU was initialized, or
    // since we may need to bring UVM's cached copies back in sync following a
    // sleep cycle.
    replayable_faults->cached_get = parent_gpu->fault_buffer_hal->read_get(parent_gpu);
    replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);

    // (Re-)enable fault prefetching
    if (parent_gpu->fault_buffer_info.prefetch_faults_enabled)
        parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
    else
        parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu);
}

// There is no error handling in this function. The caller is in charge of
// calling fault_buffer_deinit_replayable_faults on failure.
static NV_STATUS fault_buffer_init_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status = NV_OK;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
    uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;

    UVM_ASSERT(parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize %
               parent_gpu->fault_buffer_hal->entry_size(parent_gpu) == 0);

    replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize /
                                    parent_gpu->fault_buffer_hal->entry_size(parent_gpu);

    // Check the provided module parameter value
    parent_gpu->fault_buffer_info.max_batch_size = max(uvm_perf_fault_batch_count,
                                                       (NvU32)UVM_PERF_FAULT_BATCH_COUNT_MIN);
    parent_gpu->fault_buffer_info.max_batch_size = min(parent_gpu->fault_buffer_info.max_batch_size,
                                                       replayable_faults->max_faults);

    if (parent_gpu->fault_buffer_info.max_batch_size != uvm_perf_fault_batch_count) {
        pr_info("Invalid uvm_perf_fault_batch_count value on GPU %s: %u. Valid range [%u:%u]. Using %u instead\n",
                parent_gpu->name,
                uvm_perf_fault_batch_count,
                UVM_PERF_FAULT_BATCH_COUNT_MIN,
                replayable_faults->max_faults,
                parent_gpu->fault_buffer_info.max_batch_size);
    }

    batch_context->fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults * sizeof(*batch_context->fault_cache));
    if (!batch_context->fault_cache)
        return NV_ERR_NO_MEMORY;

    // fault_cache is used to signal that the tracker was initialized.
    uvm_tracker_init(&replayable_faults->replay_tracker);

    batch_context->ordered_fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults *
                                                           sizeof(*batch_context->ordered_fault_cache));
    if (!batch_context->ordered_fault_cache)
        return NV_ERR_NO_MEMORY;

    // This value must be initialized by the HAL
    UVM_ASSERT(replayable_faults->utlb_count > 0);

    batch_context->utlbs = uvm_kvmalloc_zero(replayable_faults->utlb_count * sizeof(*batch_context->utlbs));
    if (!batch_context->utlbs)
        return NV_ERR_NO_MEMORY;

    batch_context->max_utlb_id = 0;

    status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_TRUE));
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to take page fault ownership from RM: %s, GPU %s\n",
                      nvstatusToString(status),
                      parent_gpu->name);
        return status;
    }

    replayable_faults->replay_policy = uvm_perf_fault_replay_policy < UVM_PERF_FAULT_REPLAY_POLICY_MAX ?
                                           uvm_perf_fault_replay_policy :
                                           UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;

    if (replayable_faults->replay_policy != uvm_perf_fault_replay_policy) {
        pr_info("Invalid uvm_perf_fault_replay_policy value on GPU %s: %d. Using %d instead\n",
                parent_gpu->name,
                uvm_perf_fault_replay_policy,
                replayable_faults->replay_policy);
    }

    replayable_faults->replay_update_put_ratio = min(uvm_perf_fault_replay_update_put_ratio, 100u);
    if (replayable_faults->replay_update_put_ratio != uvm_perf_fault_replay_update_put_ratio) {
        pr_info("Invalid uvm_perf_fault_replay_update_put_ratio value on GPU %s: %u. Using %u instead\n",
                parent_gpu->name,
                uvm_perf_fault_replay_update_put_ratio,
                replayable_faults->replay_update_put_ratio);
    }

    // Re-enable fault prefetching just in case it was disabled in a previous run
    parent_gpu->fault_buffer_info.prefetch_faults_enabled = parent_gpu->prefetch_fault_supported;

    fault_buffer_reinit_replayable_faults(parent_gpu);

    return NV_OK;
}

static void fault_buffer_deinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
    uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;

    if (batch_context->fault_cache) {
        UVM_ASSERT(uvm_tracker_is_empty(&replayable_faults->replay_tracker));
        uvm_tracker_deinit(&replayable_faults->replay_tracker);
    }

    if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
        // Re-enable prefetch faults in case we disabled them
        if (parent_gpu->prefetch_fault_supported && !parent_gpu->fault_buffer_info.prefetch_faults_enabled)
            parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
    }

    uvm_kvfree(batch_context->fault_cache);
    uvm_kvfree(batch_context->ordered_fault_cache);
    uvm_kvfree(batch_context->utlbs);
    batch_context->fault_cache = NULL;
    batch_context->ordered_fault_cache = NULL;
    batch_context->utlbs = NULL;
}

NV_STATUS uvm_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);
    UVM_ASSERT(parent_gpu->replayable_faults_supported);

    status = uvm_rm_locked_call(nvUvmInterfaceInitFaultInfo(parent_gpu->rm_device,
                                                            &parent_gpu->fault_buffer_info.rm_info));
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to init fault buffer info from RM: %s, GPU %s\n",
                      nvstatusToString(status),
                      parent_gpu->name);

        // nvUvmInterfaceInitFaultInfo may leave fields in rm_info populated
        // when it returns an error. Set the buffer handle to zero as it is
        // used by the deinitialization logic to determine if it was correctly
        // initialized.
        parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
        goto fail;
    }

    status = fault_buffer_init_replayable_faults(parent_gpu);
    if (status != NV_OK)
        goto fail;

    if (parent_gpu->non_replayable_faults_supported) {
        status = uvm_gpu_fault_buffer_init_non_replayable_faults(parent_gpu);
        if (status != NV_OK)
            goto fail;
    }

    return NV_OK;

fail:
    uvm_gpu_fault_buffer_deinit(parent_gpu);

    return status;
}
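
// A sketch of the expected lifecycle, inferred from the functions above and
// below rather than a documented contract: uvm_gpu_fault_buffer_init() takes
// page fault interrupt ownership from RM and allocates the batch caches; a
// power management cycle only requires uvm_gpu_fault_buffer_resume() to
// re-sync the cached GET/PUT pointers; uvm_gpu_fault_buffer_deinit() returns
// interrupt ownership to RM and frees the caches.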

// Reinitialize state relevant to replayable fault handling after returning
// from a power management cycle.
void uvm_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(parent_gpu->replayable_faults_supported);

    fault_buffer_reinit_replayable_faults(parent_gpu);
}

void uvm_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    if (parent_gpu->non_replayable_faults_supported)
        uvm_gpu_fault_buffer_deinit_non_replayable_faults(parent_gpu);

    fault_buffer_deinit_replayable_faults(parent_gpu);

    if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
        status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_FALSE));
        UVM_ASSERT(status == NV_OK);

        uvm_rm_locked_call_void(nvUvmInterfaceDestroyFaultInfo(parent_gpu->rm_device,
                                                               &parent_gpu->fault_buffer_info.rm_info));

        parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
    }
}

bool uvm_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;

    UVM_ASSERT(parent_gpu->replayable_faults_supported);

    // Fast path 1: we left some faults unserviced in the buffer in the last pass
    if (replayable_faults->cached_get != replayable_faults->cached_put)
        return true;

    // Fast path 2: read the valid bit of the fault buffer entry pointed to by
    // the cached get pointer
    if (!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, replayable_faults->cached_get)) {
        // Slow path: read the put pointer from the GPU register via BAR0
        // over PCIe
        replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);

        // No interrupt pending
        if (replayable_faults->cached_get == replayable_faults->cached_put)
            return false;
    }

    return true;
}
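
// Worked example of the ring-buffer bookkeeping above (the values are
// illustrative): the fault buffer is a circular array of max_faults entries
// indexed by GET/PUT. With max_faults == 8, cached_get == 6 and
// cached_put == 2, entries 6, 7, 0 and 1 are pending, since indices wrap to 0
// when they reach max_faults (see the wraparound in fault_buffer_flush_locked
// and fetch_fault_buffer_entries below).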

// Push a fault cancel method on the given client. Any failure during this
// operation may lead to an application hang (requiring a manual Ctrl+C from
// the user) or to a system crash (requiring a reboot). In those cases we log
// an error message.
//
// gpc_id and client_id aren't used if global_cancel is true.
//
// This function acquires both the given tracker and the replay tracker.
static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu,
                                    uvm_gpu_phys_address_t instance_ptr,
                                    bool global_cancel,
                                    NvU32 gpc_id,
                                    NvU32 client_id,
                                    uvm_tracker_t *tracker)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;

    if (global_cancel) {
        status = uvm_push_begin_acquire(gpu->channel_manager,
                                        UVM_CHANNEL_TYPE_MEMOPS,
                                        &replayable_faults->replay_tracker,
                                        &push,
                                        "Cancel targeting instance_ptr {0x%llx:%s}\n",
                                        instance_ptr.address,
                                        uvm_aperture_string(instance_ptr.aperture));
    }
    else {
        status = uvm_push_begin_acquire(gpu->channel_manager,
                                        UVM_CHANNEL_TYPE_MEMOPS,
                                        &replayable_faults->replay_tracker,
                                        &push,
                                        "Cancel targeting instance_ptr {0x%llx:%s} gpc %u client %u\n",
                                        instance_ptr.address,
                                        uvm_aperture_string(instance_ptr.aperture),
                                        gpc_id,
                                        client_id);
    }

    UVM_ASSERT(status == NV_OK);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to create push and acquire replay tracker before pushing cancel: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    uvm_push_acquire_tracker(&push, tracker);

    if (global_cancel)
        gpu->parent->host_hal->cancel_faults_global(&push, instance_ptr);
    else
        gpu->parent->host_hal->cancel_faults_targeted(&push, instance_ptr, gpc_id, client_id);

    // We don't need to put the cancel in the GPU replay tracker since we wait
    // on it immediately.
    status = uvm_push_end_and_wait(&push);

    UVM_ASSERT(status == NV_OK);
    if (status != NV_OK)
        UVM_ERR_PRINT("Failed to wait for pushed cancel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));

    uvm_tracker_clear(&replayable_faults->replay_tracker);

    return status;
}

static NV_STATUS push_cancel_on_gpu_targeted(uvm_gpu_t *gpu,
                                             uvm_gpu_phys_address_t instance_ptr,
                                             NvU32 gpc_id,
                                             NvU32 client_id,
                                             uvm_tracker_t *tracker)
{
    return push_cancel_on_gpu(gpu, instance_ptr, false, gpc_id, client_id, tracker);
}

static NV_STATUS push_cancel_on_gpu_global(uvm_gpu_t *gpu, uvm_gpu_phys_address_t instance_ptr, uvm_tracker_t *tracker)
{
    UVM_ASSERT(!gpu->parent->smc.enabled);

    return push_cancel_on_gpu(gpu, instance_ptr, true, 0, 0, tracker);
}
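
// Note on the two variants (descriptive, inferred from the signatures and
// the push strings above): a "global" cancel targets all faulting clients of
// the given instance pointer, while the "targeted" variant restricts the
// cancel to a specific gpc_id/client_id. The assert above forbids the global
// variant under SMC partitioning; translate_instance_ptrs() below
// correspondingly uses the targeted variant when SMC is enabled.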

// Volta implements a targeted VA fault cancel that simplifies the fault cancel
// process. You only need to specify the address, type, and mmu_engine_id for
// the access to be cancelled. Caller must hold the VA space lock for the
// access to be cancelled.
static NV_STATUS cancel_fault_precise_va(uvm_gpu_t *gpu,
                                         uvm_fault_buffer_entry_t *fault_entry,
                                         uvm_fault_cancel_va_mode_t cancel_va_mode)
{
    NV_STATUS status;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_gpu_phys_address_t pdb;
    uvm_push_t push;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
    NvU64 offset;

    UVM_ASSERT(gpu->parent->replayable_faults_supported);
    UVM_ASSERT(fault_entry->fatal_reason != UvmEventFatalReasonInvalid);
    UVM_ASSERT(!fault_entry->filtered);

    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(fault_entry->va_space, gpu->parent);
    UVM_ASSERT(gpu_va_space);
    pdb = uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr;

    // Record fatal fault event
    uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    &replayable_faults->replay_tracker,
                                    &push,
                                    "Precise cancel targeting PDB {0x%llx:%s} VA 0x%llx VEID %u with access type %s",
                                    pdb.address,
                                    uvm_aperture_string(pdb.aperture),
                                    fault_entry->fault_address,
                                    fault_entry->fault_source.ve_id,
                                    uvm_fault_access_type_string(fault_entry->fault_access_type));
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to create push and acquire replay tracker before pushing cancel: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    // UVM aligns fault addresses to PAGE_SIZE as it is the smallest mapping
    // and coherence tracking granularity. However, the cancel method requires
    // the original address (4K-aligned) reported in the packet, which is lost
    // at this point. Since the access permissions are the same for the whole
    // 64K page, we issue a cancel per 4K range to make sure that the HW sees
    // the address reported in the packet.
    for (offset = 0; offset < PAGE_SIZE; offset += UVM_PAGE_SIZE_4K) {
        gpu->parent->host_hal->cancel_faults_va(&push, pdb, fault_entry, cancel_va_mode);
        fault_entry->fault_address += UVM_PAGE_SIZE_4K;
    }
    fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address - 1);

    // We don't need to put the cancel in the GPU replay tracker since we wait
    // on it immediately.
    status = uvm_push_end_and_wait(&push);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to wait for pushed VA global fault cancel: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
    }

    uvm_tracker_clear(&replayable_faults->replay_tracker);

    return status;
}
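
// Illustration of the 4K sweep in cancel_fault_precise_va() above (assuming a
// 64KB kernel PAGE_SIZE; on 4KB-page kernels the loop runs once and the
// tracked address is already exact): a fatal fault tracked at the aligned
// address 0x7f0000 results in 16 cancel methods, one per 4K offset (0x7f0000,
// 0x7f1000, ..., 0x7ff000), guaranteeing that whichever 4K-aligned address
// the HW originally reported is covered.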

static NV_STATUS push_replay_on_gpu(uvm_gpu_t *gpu,
                                    uvm_fault_replay_type_t type,
                                    uvm_fault_service_batch_context_t *batch_context)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
    uvm_tracker_t *tracker = NULL;

    if (batch_context)
        tracker = &batch_context->tracker;

    status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, tracker, &push,
                                    "Replaying faults");
    if (status != NV_OK)
        return status;

    gpu->parent->host_hal->replay_faults(&push, type);

    // Do not count REPLAY_TYPE_START_ACK_ALL's toward the replay count.
    // REPLAY_TYPE_START_ACK_ALL's are issued for cancels, and the cancel
    // algorithm checks to make sure that no REPLAY_TYPE_START's have been
    // issued using batch_context->replays.
    if (batch_context && type != UVM_FAULT_REPLAY_TYPE_START_ACK_ALL) {
        uvm_tools_broadcast_replay(gpu, &push, batch_context->batch_id, UVM_FAULT_CLIENT_TYPE_GPC);
        ++batch_context->num_replays;
    }

    uvm_push_end(&push);

    // Add this push to the GPU's replay_tracker so cancel can wait on it.
    status = uvm_tracker_add_push_safe(&replayable_faults->replay_tracker, &push);

    if (uvm_procfs_is_debug_enabled()) {
        if (type == UVM_FAULT_REPLAY_TYPE_START)
            ++replayable_faults->stats.num_replays;
        else
            ++replayable_faults->stats.num_replays_ack_all;
    }

    return status;
}

static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
{
    uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;

    UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));

    // Write get on the GPU only if it's changed.
    if (replayable_faults->cached_get == get)
        return;

    replayable_faults->cached_get = get;

    // Update get pointer on the GPU
    parent_gpu->fault_buffer_hal->write_get(parent_gpu, get);
}

static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,
                                           uvm_gpu_buffer_flush_mode_t flush_mode,
                                           uvm_fault_replay_type_t fault_replay,
                                           uvm_fault_service_batch_context_t *batch_context)
{
    NvU32 get;
    NvU32 put;
    uvm_spin_loop_t spin;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;

    UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
    UVM_ASSERT(gpu->parent->replayable_faults_supported);

    // Read PUT pointer from the GPU if requested
    if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT)
        replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);

    get = replayable_faults->cached_get;
    put = replayable_faults->cached_put;

    while (get != put) {
        // Wait until valid bit is set
        UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin);

        gpu->parent->fault_buffer_hal->entry_clear_valid(gpu->parent, get);
        ++get;
        if (get == replayable_faults->max_faults)
            get = 0;
    }

    write_get(gpu->parent, get);

    // Issue fault replay
    return push_replay_on_gpu(gpu, fault_replay, batch_context);
}

NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu)
{
    NV_STATUS status = NV_OK;

    UVM_ASSERT(gpu->parent->replayable_faults_supported);

    // Disables replayable fault interrupts and fault servicing
    uvm_gpu_replayable_faults_isr_lock(gpu->parent);

    status = fault_buffer_flush_locked(gpu,
                                       UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
                                       UVM_FAULT_REPLAY_TYPE_START,
                                       NULL);

    // This will trigger the top half to start servicing faults again, if the
    // replay brought any back in
    uvm_gpu_replayable_faults_isr_unlock(gpu->parent);
    return status;
}

static inline int cmp_fault_instance_ptr(const uvm_fault_buffer_entry_t *a,
                                         const uvm_fault_buffer_entry_t *b)
{
    int result = uvm_gpu_phys_addr_cmp(a->instance_ptr, b->instance_ptr);

    // On Volta+ we need to sort by the {instance_ptr + subctx_id} pair since
    // it can map to a different VA space
    if (result != 0)
        return result;
    return UVM_CMP_DEFAULT(a->fault_source.ve_id, b->fault_source.ve_id);
}

// Compare two VA spaces
static inline int cmp_va_space(const uvm_va_space_t *a, const uvm_va_space_t *b)
{
    return UVM_CMP_DEFAULT(a, b);
}

// Compare two virtual addresses
static inline int cmp_addr(NvU64 a, NvU64 b)
{
    return UVM_CMP_DEFAULT(a, b);
}

// Compare two fault access types. Returns a negative value when a is more
// "intrusive" than b, so sorting with this comparator places the most
// intrusive access type first.
static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_type_t b)
{
    UVM_ASSERT(a >= 0 && a < UVM_FAULT_ACCESS_TYPE_COUNT);
    UVM_ASSERT(b >= 0 && b < UVM_FAULT_ACCESS_TYPE_COUNT);

    // Check that fault access type enum values are ordered by "intrusiveness"
    BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG <= UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK);
    BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK <= UVM_FAULT_ACCESS_TYPE_WRITE);
    BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_WRITE <= UVM_FAULT_ACCESS_TYPE_READ);
    BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_READ <= UVM_FAULT_ACCESS_TYPE_PREFETCH);

    return b - a;
}

typedef enum
{
    // Fetch a batch of faults from the buffer, waiting for every entry in the
    // batch to become ready.
    FAULT_FETCH_MODE_BATCH_ALL,

    // Fetch a batch of faults from the buffer. Stop at the first entry that is
    // not ready yet.
    FAULT_FETCH_MODE_BATCH_READY,

    // Fetch all faults in the buffer before PUT. Wait for all faults to become
    // ready.
    FAULT_FETCH_MODE_ALL,
} fault_fetch_mode_t;

static void fetch_fault_buffer_merge_entry(uvm_fault_buffer_entry_t *current_entry,
                                           uvm_fault_buffer_entry_t *last_entry)
{
    UVM_ASSERT(last_entry->num_instances > 0);

    ++last_entry->num_instances;
    uvm_fault_access_type_mask_set(&last_entry->access_type_mask, current_entry->fault_access_type);

    if (current_entry->fault_access_type > last_entry->fault_access_type) {
        // If the new entry has a higher access type, it becomes the
        // fault to be serviced. Add the previous one to the list of instances
        current_entry->access_type_mask = last_entry->access_type_mask;
        current_entry->num_instances = last_entry->num_instances;
        last_entry->filtered = true;

        // We only merge faults from different uTLBs if the new fault has an
        // access type with the same or lower level of intrusiveness (see
        // fetch_fault_buffer_try_merge_entry), so taking this branch implies
        // that both faults came from the same uTLB.
        UVM_ASSERT(current_entry->fault_source.utlb_id == last_entry->fault_source.utlb_id);

        list_replace(&last_entry->merged_instances_list, &current_entry->merged_instances_list);
        list_add(&last_entry->merged_instances_list, &current_entry->merged_instances_list);
    }
    else {
        // Add the new entry to the list of instances for reporting purposes
        current_entry->filtered = true;
        list_add(&current_entry->merged_instances_list, &last_entry->merged_instances_list);
    }
}

static bool fetch_fault_buffer_try_merge_entry(uvm_fault_buffer_entry_t *current_entry,
                                               uvm_fault_service_batch_context_t *batch_context,
                                               uvm_fault_utlb_info_t *current_tlb,
                                               bool is_same_instance_ptr)
{
    uvm_fault_buffer_entry_t *last_tlb_entry = current_tlb->last_fault;
    uvm_fault_buffer_entry_t *last_global_entry = batch_context->last_fault;

    // Check the last coalesced fault and the coalesced fault that was
    // originated from this uTLB
    const bool is_last_tlb_fault = current_tlb->num_pending_faults > 0 &&
                                   cmp_fault_instance_ptr(current_entry, last_tlb_entry) == 0 &&
                                   current_entry->fault_address == last_tlb_entry->fault_address;

    // We only merge faults from different uTLBs if the new fault has an
    // access type with the same or lower level of intrusiveness. This is to
    // avoid having to update num_pending_faults on both uTLBs and recomputing
    // last_fault.
    const bool is_last_fault = is_same_instance_ptr &&
                               current_entry->fault_address == last_global_entry->fault_address &&
                               current_entry->fault_access_type <= last_global_entry->fault_access_type;

    if (is_last_tlb_fault) {
        fetch_fault_buffer_merge_entry(current_entry, last_tlb_entry);
        if (current_entry->fault_access_type > last_tlb_entry->fault_access_type)
            current_tlb->last_fault = current_entry;

        return true;
    }
    else if (is_last_fault) {
        fetch_fault_buffer_merge_entry(current_entry, last_global_entry);
        if (current_entry->fault_access_type > last_global_entry->fault_access_type)
            batch_context->last_fault = current_entry;

        return true;
    }

    return false;
}
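
// Worked example of the coalescing above (illustrative values): suppose a
// uTLB reports a READ and then a WRITE for the same instance_ptr and page
// address. The WRITE arrives second and is more intrusive, so it becomes the
// representative entry with num_instances == 2 and access_type_mask ==
// {READ, WRITE}, while the READ entry is marked filtered and linked into the
// representative's merged_instances_list for reporting. Had the second fault
// been another READ, it would simply have been filtered into the first one.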

// Fetch entries from the fault buffer, decode them and store them in the batch
// context. We implement the fetch modes described above.
//
// When possible, we coalesce duplicate entries to minimize the fault handling
// overhead. Basically, we merge faults with the same instance pointer and page
// virtual address. We keep track of the last fault per uTLB to detect
// duplicates due to local reuse, and the last fault in the whole batch to
// detect reuse across CTAs.
//
// We will service the first fault entry with the most "intrusive" (atomic >
// write > read > prefetch) access type*. That fault entry is called the
// "representative". The rest of the filtered faults have the "filtered" flag
// set and are added to a list in the representative fault entry for reporting
// purposes. The representative fault entry also contains a mask with all the
// access types that produced a fault on the page.
//
// *We only merge faults from different uTLBs if the new fault has an access
// type with the same or lower level of intrusiveness.
//
// This optimization cannot be performed during fault cancel on Pascal GPUs
// (fetch_mode == FAULT_FETCH_MODE_ALL) since we need accurate tracking of all
// the faults in each uTLB in order to guarantee precise fault attribution.
static void fetch_fault_buffer_entries(uvm_gpu_t *gpu,
                                       uvm_fault_service_batch_context_t *batch_context,
                                       fault_fetch_mode_t fetch_mode)
{
    NvU32 get;
    NvU32 put;
    NvU32 fault_index;
    NvU32 num_coalesced_faults;
    NvU32 utlb_id;
    uvm_fault_buffer_entry_t *fault_cache;
    uvm_spin_loop_t spin;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
    const bool in_pascal_cancel_path = (!gpu->parent->fault_cancel_va_supported && fetch_mode == FAULT_FETCH_MODE_ALL);
    const bool may_filter = uvm_perf_fault_coalesce && !in_pascal_cancel_path;

    UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
    UVM_ASSERT(gpu->parent->replayable_faults_supported);

    fault_cache = batch_context->fault_cache;

    get = replayable_faults->cached_get;

    // Read the put pointer from the GPU and cache it
    if (get == replayable_faults->cached_put)
        replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);

    put = replayable_faults->cached_put;

    batch_context->is_single_instance_ptr = true;
    batch_context->last_fault = NULL;

    fault_index = 0;
    num_coalesced_faults = 0;

    // Clear uTLB counters
    for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
        batch_context->utlbs[utlb_id].num_pending_faults = 0;
        batch_context->utlbs[utlb_id].has_fatal_faults = false;
    }
    batch_context->max_utlb_id = 0;

    if (get == put)
        goto done;

    // Parse entries as long as get != put and there is still space to cache
    // them in the batch
    while ((get != put) &&
           (fetch_mode == FAULT_FETCH_MODE_ALL || fault_index < gpu->parent->fault_buffer_info.max_batch_size)) {
        bool is_same_instance_ptr = true;
        uvm_fault_buffer_entry_t *current_entry = &fault_cache[fault_index];
        uvm_fault_utlb_info_t *current_tlb;

        // We cannot just wait for the last entry (the one pointed to by put)
        // to become valid, we have to do it individually since entries can be
        // written out of order
        UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
            // We have some entry to work on. Let's do the rest later.
            if (fetch_mode != FAULT_FETCH_MODE_ALL &&
                fetch_mode != FAULT_FETCH_MODE_BATCH_ALL &&
                fault_index > 0)
                goto done;
        }

        // Prevent later accesses from being moved above the read of the valid
        // bit
        smp_mb__after_atomic();

        // Got the valid bit set. Let's cache the entry.
        gpu->parent->fault_buffer_hal->parse_entry(gpu->parent, get, current_entry);

        // The GPU aligns the fault addresses to 4k, but all of our tracking is
        // done in PAGE_SIZE chunks, which might be larger.
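        // For example (illustrative addresses), on a kernel with 64KB pages a
        // fault reported at 0x10003000 is tracked at the page boundary
        // 0x10000000; on 4KB-page kernels the reported address is already
        // page-aligned and the alignment below is a no-op.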
        current_entry->fault_address = UVM_PAGE_ALIGN_DOWN(current_entry->fault_address);

        // Make sure that all fields in the entry are properly initialized
        current_entry->is_fatal = (current_entry->fault_type >= UVM_FAULT_TYPE_FATAL);

        if (current_entry->is_fatal) {
            // Record the fatal fault event later as we need the va_space locked
            current_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
        }
        else {
            current_entry->fatal_reason = UvmEventFatalReasonInvalid;
        }

        current_entry->va_space = NULL;
        current_entry->filtered = false;

        if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
            UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
            batch_context->max_utlb_id = current_entry->fault_source.utlb_id;
        }

        current_tlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];

        if (fault_index > 0) {
            UVM_ASSERT(batch_context->last_fault);
            is_same_instance_ptr = cmp_fault_instance_ptr(current_entry, batch_context->last_fault) == 0;

            // Coalesce duplicate faults when possible
            if (may_filter && !current_entry->is_fatal) {
                bool merged = fetch_fault_buffer_try_merge_entry(current_entry,
                                                                 batch_context,
                                                                 current_tlb,
                                                                 is_same_instance_ptr);
                if (merged)
                    goto next_fault;
            }
        }

        if (batch_context->is_single_instance_ptr && !is_same_instance_ptr)
            batch_context->is_single_instance_ptr = false;

        current_entry->num_instances = 1;
        current_entry->access_type_mask = uvm_fault_access_type_mask_bit(current_entry->fault_access_type);
        INIT_LIST_HEAD(&current_entry->merged_instances_list);

        ++current_tlb->num_pending_faults;
        current_tlb->last_fault = current_entry;
        batch_context->last_fault = current_entry;

        ++num_coalesced_faults;

    next_fault:
        ++fault_index;
        ++get;
        if (get == replayable_faults->max_faults)
            get = 0;
    }

done:
    write_get(gpu->parent, get);

    batch_context->num_cached_faults = fault_index;
    batch_context->num_coalesced_faults = num_coalesced_faults;
}

// Sort comparator for pointers to fault buffer entries that sorts by
// instance pointer
static int cmp_sort_fault_entry_by_instance_ptr(const void *_a, const void *_b)
{
    const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a;
    const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b;

    return cmp_fault_instance_ptr(*a, *b);
}

// Sort comparator for pointers to fault buffer entries that sorts by va_space,
// fault address and fault access type
static int cmp_sort_fault_entry_by_va_space_address_access_type(const void *_a, const void *_b)
{
    const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a;
    const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b;

    int result;

    result = cmp_va_space((*a)->va_space, (*b)->va_space);
    if (result != 0)
        return result;

    result = cmp_addr((*a)->fault_address, (*b)->fault_address);
    if (result != 0)
        return result;

    return cmp_access_type((*a)->fault_access_type, (*b)->fault_access_type);
}
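
// Example of the resulting order (illustrative): after sorting with the
// comparator above, three faults on the same va_space and page with access
// types WRITE, READ and ATOMIC_STRONG are laid out as
// [ATOMIC_STRONG, WRITE, READ], because cmp_access_type() returns b - a and
// therefore orders descending by intrusiveness. The batch servicing code
// relies on the most intrusive access type appearing first for each page.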

// Translate all instance pointers to VA spaces. Since the buffer is ordered by
// instance_ptr, we minimize the number of translations.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer
// flush occurred and executed successfully, the error code if the flush
// failed, and NV_OK otherwise.
static NV_STATUS translate_instance_ptrs(uvm_gpu_t *gpu,
                                         uvm_fault_service_batch_context_t *batch_context)
{
    NvU32 i;
    NV_STATUS status;

    for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
        uvm_fault_buffer_entry_t *current_entry;

        current_entry = batch_context->ordered_fault_cache[i];

        // If this instance pointer matches the previous instance pointer, just
        // copy over the already-translated va_space and move on.
        if (i != 0 && cmp_fault_instance_ptr(current_entry, batch_context->ordered_fault_cache[i - 1]) == 0) {
            current_entry->va_space = batch_context->ordered_fault_cache[i - 1]->va_space;
            continue;
        }

        status = uvm_gpu_fault_entry_to_va_space(gpu, current_entry, &current_entry->va_space);
        if (status != NV_OK) {
            if (status == NV_ERR_PAGE_TABLE_NOT_AVAIL) {
                // The channel is valid but the subcontext is not. This can only
                // happen if the subcontext is torn down before its work is
                // complete while other subcontexts in the same TSG are still
                // executing. This is a violation of the programming model. We
                // have limited options since the VA space is gone, meaning we
                // can't target the PDB for cancel even if we wanted to. So
                // we'll just throw away precise attribution and cancel this
                // fault using the SW method, which validates that the intended
                // context (TSG) is still running so we don't cancel an innocent
                // context.
                UVM_ASSERT(!current_entry->va_space);
                UVM_ASSERT(gpu->max_subcontexts > 0);

                if (gpu->parent->smc.enabled) {
                    status = push_cancel_on_gpu_targeted(gpu,
                                                         current_entry->instance_ptr,
                                                         current_entry->fault_source.gpc_id,
                                                         current_entry->fault_source.client_id,
                                                         &batch_context->tracker);
                }
                else {
                    status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
                }

                if (status != NV_OK)
                    return status;

                // Fall through and let the flush restart fault processing
            }
            else {
                UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
            }

            // If the channel is gone then we're looking at a stale fault entry.
            // The fault must have been resolved already (serviced or
            // cancelled), so we can just flush the fault buffer.
            status = fault_buffer_flush_locked(gpu,
                                               UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
                                               UVM_FAULT_REPLAY_TYPE_START,
                                               batch_context);
            if (status != NV_OK)
                return status;

            return NV_WARN_MORE_PROCESSING_REQUIRED;
        }
        else {
            UVM_ASSERT(current_entry->va_space);
        }
    }

    return NV_OK;
}
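
// Illustration of why the pre-sort by instance_ptr pays off (hypothetical
// numbers): if a batch of 256 coalesced faults comes from only two distinct
// {instance_ptr, VEID} pairs, the sorted order groups them so the loop above
// performs just two uvm_gpu_fault_entry_to_va_space() translations; every
// other entry copies the va_space already translated for its predecessor.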

// Fault cache preprocessing for fault coalescing.
//
// This function generates an ordered view of the given fault_cache in which
// faults are sorted by VA space, fault address (aligned to 4K) and access type
// "intrusiveness". In order to minimize the number of instance_ptr to VA space
// translations we perform a first sort by instance_ptr.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer
// flush occurred during instance_ptr translation and executed successfully,
// the error code if it failed, and NV_OK otherwise.
//
// Current scheme:
// 1) sort by instance_ptr
// 2) translate all instance_ptrs to VA spaces
// 3) sort by va_space, fault address (fault_address is page-aligned at this
//    point) and access type
static NV_STATUS preprocess_fault_batch(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
    NV_STATUS status;
    NvU32 i, j;
    uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache;

    UVM_ASSERT(batch_context->num_coalesced_faults > 0);
    UVM_ASSERT(batch_context->num_cached_faults >= batch_context->num_coalesced_faults);

    // Generate an ordered view of the fault cache in ordered_fault_cache.
    // We sort the pointers, not the entries in fault_cache.

    // Initialize the pointers before they are sorted. We only sort the
    // representative instance of each coalesced fault.
    for (i = 0, j = 0; i < batch_context->num_cached_faults; ++i) {
        if (!batch_context->fault_cache[i].filtered)
            ordered_fault_cache[j++] = &batch_context->fault_cache[i];
    }
    UVM_ASSERT(j == batch_context->num_coalesced_faults);

    // 1) if the fault batch contains more than one instance_ptr, sort by
    //    instance_ptr
    if (!batch_context->is_single_instance_ptr) {
        sort(ordered_fault_cache,
             batch_context->num_coalesced_faults,
             sizeof(*ordered_fault_cache),
             cmp_sort_fault_entry_by_instance_ptr,
             NULL);
    }

    // 2) translate all instance_ptrs to VA spaces
    status = translate_instance_ptrs(gpu, batch_context);
    if (status != NV_OK)
        return status;

    // 3) sort by va_space, fault address (the GPU already reports 4K-aligned
    //    addresses) and access type
    sort(ordered_fault_cache,
         batch_context->num_coalesced_faults,
         sizeof(*ordered_fault_cache),
         cmp_sort_fault_entry_by_va_space_address_access_type,
         NULL);

    return NV_OK;
}

static bool check_fault_entry_duplicate(const uvm_fault_buffer_entry_t *current_entry,
                                        const uvm_fault_buffer_entry_t *previous_entry)
{
    bool is_duplicate = false;

    if (previous_entry) {
        is_duplicate = (current_entry->va_space == previous_entry->va_space) &&
                       (current_entry->fault_address == previous_entry->fault_address);
    }

    return is_duplicate;
}

static void fault_entry_duplicate_flags(uvm_fault_buffer_entry_t *current_entry,
                                        const uvm_fault_buffer_entry_t *previous_entry)
{
    UVM_ASSERT(previous_entry);
    UVM_ASSERT(check_fault_entry_duplicate(current_entry, previous_entry));

    // Propagate the is_invalid_prefetch flag across all prefetch faults
    // on the page
    if (previous_entry->is_invalid_prefetch)
        current_entry->is_invalid_prefetch = true;

    // If a page is throttled, all faults on the page must be skipped
    if (previous_entry->is_throttled)
        current_entry->is_throttled = true;
}

static void update_batch_context(uvm_fault_service_batch_context_t *batch_context,
                                 uvm_fault_buffer_entry_t *current_entry,
                                 const uvm_fault_buffer_entry_t *previous_entry)
{
    bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
    uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];

    UVM_ASSERT(utlb->num_pending_faults > 0);

    if (is_duplicate)
        batch_context->num_duplicate_faults += current_entry->num_instances;
    else
        batch_context->num_duplicate_faults += current_entry->num_instances - 1;
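
    // Accounting note for the counting above (illustration): a representative
    // entry that coalesced three fault instances contributes 3 duplicates
    // when it also duplicates the previous ordered entry, but only 2 when it
    // does not, since in that case the representative itself is the one
    // non-duplicate instance on the page.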

    if (current_entry->is_invalid_prefetch)
        batch_context->num_invalid_prefetch_faults += current_entry->num_instances;

    if (current_entry->is_fatal) {
        utlb->has_fatal_faults = true;
        batch_context->has_fatal_faults = true;
    }

    if (current_entry->is_throttled)
        batch_context->has_throttled_faults = true;
}

// This function computes the maximum access type that can be serviced for the
// reported fault instances given the logical permissions of the VA range. If
// none of the fault instances can be serviced, UVM_FAULT_ACCESS_TYPE_COUNT is
// returned instead.
//
// In the case that there are faults that cannot be serviced, this function
// also sets the flags required for fault cancellation. Prefetch faults do not
// need to be cancelled since they disappear on replay.
//
// The UVM driver considers two scenarios for logical permissions violation:
// - All access types are invalid. For example, when faulting from a processor
// that doesn't have access to the preferred location of a range group when it
// is not migratable. In this case all accesses to the page must be cancelled.
// - Write/atomic accesses are invalid. Basically, when trying to modify a
// read-only VA range. In this case we restrict fault cancelling to those types
// of accesses.
//
// Return value:
// - service_access_type: highest access type that can be serviced.
static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
                                                              uvm_va_block_t *va_block,
                                                              uvm_va_block_context_t *va_block_context,
                                                              uvm_fault_buffer_entry_t *fault_entry,
                                                              bool allow_migration)
{
    NV_STATUS perm_status;

    perm_status = uvm_va_block_check_logical_permissions(va_block,
                                                         va_block_context,
                                                         gpu->id,
                                                         uvm_va_block_cpu_page_index(va_block,
                                                                                     fault_entry->fault_address),
                                                         fault_entry->fault_access_type,
                                                         allow_migration);
    if (perm_status == NV_OK)
        return fault_entry->fault_access_type;

    if (fault_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
        fault_entry->is_invalid_prefetch = true;
        return UVM_FAULT_ACCESS_TYPE_COUNT;
    }

    // At this point we know that some fault instances cannot be serviced
    fault_entry->is_fatal = true;
    fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);

    if (fault_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) {
        fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;

        // If there are pending read accesses on the same page, we have to
        // service them before we can cancel the write/atomic faults. So we
        // retry with the read fault access type.
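        // For example (illustrative): a coalesced entry with access_type_mask
        // {READ, WRITE} faulting on a read-only range is marked fatal with
        // cancel_va_mode WRITE_AND_ATOMIC, but if the READ permission check
        // below succeeds, UVM_FAULT_ACCESS_TYPE_READ is returned so the read
        // instances still get serviced before the cancel.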
        if (uvm_fault_access_type_mask_test(fault_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
            perm_status = uvm_va_block_check_logical_permissions(va_block,
                                                                 va_block_context,
                                                                 gpu->id,
                                                                 uvm_va_block_cpu_page_index(va_block,
                                                                                             fault_entry->fault_address),
                                                                 UVM_FAULT_ACCESS_TYPE_READ,
                                                                 allow_migration);
            if (perm_status == NV_OK)
                return UVM_FAULT_ACCESS_TYPE_READ;

            // If that didn't succeed, cancel all faults
            fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
            fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
        }
    }
    else {
        fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
    }

    return UVM_FAULT_ACCESS_TYPE_COUNT;
}

// We notify the fault event for all faults within the block so that the
// performance heuristics are updated. Then, all required actions for the block
// data are performed by the performance heuristics code.
//
// Fatal faults are flagged as fatal for later cancellation. Servicing is not
// interrupted on fatal faults due to insufficient permissions or invalid
// addresses.
//
// Return codes:
// - NV_OK if all faults were handled (both fatal and non-fatal)
// - NV_ERR_MORE_PROCESSING_REQUIRED if servicing needs allocation retry
// - NV_ERR_NO_MEMORY if the faults could not be serviced due to OOM
// - Any other value is a UVM-global error
static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
                                                  uvm_va_block_t *va_block,
                                                  uvm_va_block_retry_t *va_block_retry,
                                                  uvm_fault_service_batch_context_t *batch_context,
                                                  NvU32 first_fault_index,
                                                  NvU32 *block_faults)
{
    NV_STATUS status = NV_OK;
    NvU32 i;
    uvm_page_index_t first_page_index;
    uvm_page_index_t last_page_index;
    NvU32 page_fault_count = 0;
    uvm_range_group_range_iter_t iter;
    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
    uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache;
    uvm_service_block_context_t *block_context = &replayable_faults->block_service_context;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    NvU64 end;

    // Check that all uvm_fault_access_type_t values can fit into an NvU8
    BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_COUNT > (int)(NvU8)-1);

    uvm_assert_mutex_locked(&va_block->lock);

    *block_faults = 0;

    first_page_index = PAGES_PER_UVM_VA_BLOCK;
    last_page_index = 0;

    // Initialize the fault service block context
    uvm_processor_mask_zero(&block_context->resident_processors);
    block_context->thrashing_pin_count = 0;
    block_context->read_duplicate_count = 0;

    uvm_range_group_range_migratability_iter_first(va_space, va_block->start, va_block->end, &iter);

    // The first entry is guaranteed to fall within this block
    UVM_ASSERT(ordered_fault_cache[first_fault_index]->va_space == va_space);
    UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address >= va_block->start);
    UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address <= va_block->end);

    if (uvm_va_block_is_hmm(va_block)) {
        uvm_hmm_find_policy_end(va_block,
                                &block_context->block_context,
                                ordered_fault_cache[first_fault_index]->fault_address,
                                &end);
    }
    else {
        block_context->block_context.policy = uvm_va_range_get_policy(va_block->va_range);
        end = va_block->end;
    }

    // Scan the sorted array and notify the fault event for all fault entries
    // in the block
    for (i = first_fault_index;
         i < batch_context->num_coalesced_faults &&
         ordered_fault_cache[i]->va_space == va_space &&
         ordered_fault_cache[i]->fault_address <= end;
         ++i) {
        uvm_fault_buffer_entry_t *current_entry = ordered_fault_cache[i];
        const uvm_fault_buffer_entry_t *previous_entry = NULL;
        bool read_duplicate;
        uvm_processor_id_t new_residency;
        uvm_perf_thrashing_hint_t thrashing_hint;
        uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, current_entry->fault_address);
        bool is_duplicate = false;
        uvm_fault_access_type_t service_access_type;
        NvU32 service_access_type_mask;

        UVM_ASSERT(current_entry->fault_access_type ==
                   uvm_fault_access_type_mask_highest(current_entry->access_type_mask));

        current_entry->is_fatal = false;
        current_entry->is_throttled = false;
        current_entry->is_invalid_prefetch = false;

        if (i > first_fault_index) {
            previous_entry = ordered_fault_cache[i - 1];
            is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
        }

        if (block_context->num_retries == 0) {
            uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
                                            va_block,
                                            gpu->id,
                                            block_context->block_context.policy->preferred_location,
                                            current_entry,
                                            batch_context->batch_id,
                                            is_duplicate);
        }

        // Service only the most intrusive fault per page. Waive the rest.
        if (is_duplicate) {
            fault_entry_duplicate_flags(current_entry, previous_entry);

            // The previous fault was non-fatal, so the page has already been
            // serviced
            if (!previous_entry->is_fatal)
                goto next;
        }

        // Ensure that the migratability iterator covers the current fault
        // address
        while (iter.end < current_entry->fault_address)
            uvm_range_group_range_migratability_iter_next(va_space, &iter, va_block->end);

        UVM_ASSERT(iter.start <= current_entry->fault_address && iter.end >= current_entry->fault_address);

        service_access_type = check_fault_access_permissions(gpu,
                                                             va_block,
                                                             &block_context->block_context,
                                                             current_entry,
                                                             iter.migratable);

        // Do not exit early due to logical errors such as access permission
        // violation.
        if (service_access_type == UVM_FAULT_ACCESS_TYPE_COUNT)
            goto next;

        if (service_access_type != current_entry->fault_access_type) {
            // Some of the fault instances cannot be serviced due to invalid
            // access permissions. Recompute the access type service mask to
            // service the rest.
            UVM_ASSERT(service_access_type < current_entry->fault_access_type);
            service_access_type_mask = uvm_fault_access_type_mask_bit(service_access_type);
        }
        else {
            service_access_type_mask = current_entry->access_type_mask;
        }

        // If the GPU already has the necessary access permission, the fault
        // does not need to be serviced
        if (uvm_va_block_page_is_gpu_authorized(va_block,
                                                page_index,
                                                gpu->id,
                                                uvm_fault_access_type_to_prot(service_access_type)))
            goto next;

        thrashing_hint = uvm_perf_thrashing_get_hint(va_block, current_entry->fault_address, gpu->id);
        if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
            // Throttling is implemented by sleeping in the fault handler on
            // the CPU and by continuing to process faults on other pages on
            // the GPU
            current_entry->is_throttled = true;
            goto next;
        }
        else if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
            if (block_context->thrashing_pin_count++ == 0)
                uvm_page_mask_zero(&block_context->thrashing_pin_mask);

            uvm_page_mask_set(&block_context->thrashing_pin_mask, page_index);
        }

        // Compute new residency and update the masks
        new_residency = uvm_va_block_select_residency(va_block,
                                                      &block_context->block_context,
                                                      page_index,
                                                      gpu->id,
                                                      service_access_type_mask,
                                                      block_context->block_context.policy,
                                                      &thrashing_hint,
                                                      UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
                                                      &read_duplicate);

        if (!uvm_processor_mask_test_and_set(&block_context->resident_processors, new_residency))
            uvm_page_mask_zero(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);

        uvm_page_mask_set(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);

        if (read_duplicate) {
            if (block_context->read_duplicate_count++ == 0)
                uvm_page_mask_zero(&block_context->read_duplicate_mask);

            uvm_page_mask_set(&block_context->read_duplicate_mask, page_index);
        }

        ++page_fault_count;

        block_context->access_type[page_index] = service_access_type;

        if (page_index < first_page_index)
            first_page_index = page_index;
        if (page_index > last_page_index)
            last_page_index = page_index;

    next:
        // Only update counters the first time since logical permissions cannot
        // change while we hold the VA space lock
        // TODO: Bug 1750144: That might not be true with HMM.
        if (block_context->num_retries == 0)
            update_batch_context(batch_context, current_entry, previous_entry);
    }

    // Apply the changes computed in the fault service block context, if there
    // are pages to be serviced
    if (page_fault_count > 0) {
        block_context->region = uvm_va_block_region(first_page_index, last_page_index + 1);
        status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, block_context);
    }

    *block_faults = i - first_fault_index;

    ++block_context->num_retries;

    if (status == NV_OK && batch_context->has_fatal_faults)
        status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);

    return status;
}
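
// Note on the retry loop driving the function above (an inference from
// num_retries and UVM_VA_BLOCK_RETRY_LOCKED, not a documented contract):
// servicing may be re-executed for the same block when memory eviction forces
// an allocation retry, which is why event notification and batch counter
// updates are guarded by num_retries == 0 so they only happen on the first
// pass.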

// We notify the fault event for all faults within the block so that the
// performance heuristics are updated. The VA block lock is taken for the whole
// fault servicing, although it might be temporarily dropped and re-taken if
// memory eviction is required.
//
// See the comments of service_fault_batch_block_locked for implementation
// details and error codes.
static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu,
                                           uvm_va_block_t *va_block,
                                           uvm_fault_service_batch_context_t *batch_context,
                                           NvU32 first_fault_index,
                                           NvU32 *block_faults)
{
    NV_STATUS status;
    uvm_va_block_retry_t va_block_retry;
    NV_STATUS tracker_status;
    uvm_service_block_context_t *fault_block_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;

    fault_block_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
    fault_block_context->num_retries = 0;

    uvm_mutex_lock(&va_block->lock);

    status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
                                       service_fault_batch_block_locked(gpu,
                                                                        va_block,
                                                                        &va_block_retry,
                                                                        batch_context,
                                                                        first_fault_index,
                                                                        block_faults));

    tracker_status = uvm_tracker_add_tracker_safe(&batch_context->tracker, &va_block->tracker);

    uvm_mutex_unlock(&va_block->lock);

    return status == NV_OK ? tracker_status : status;
}

typedef enum
{
    // Use this mode when calling from the normal fault servicing path
    FAULT_SERVICE_MODE_REGULAR,

    // Use this mode when servicing faults from the fault cancelling algorithm.
    // In this mode no replays are issued
    FAULT_SERVICE_MODE_CANCEL,
} fault_service_mode_t;

static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
                                         struct mm_struct *mm,
                                         uvm_fault_service_batch_context_t *batch_context,
                                         NvU32 first_fault_index,
                                         NvU32 *block_faults)
{
    NV_STATUS status;
    uvm_gpu_t *gpu = gpu_va_space->gpu;
    uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
    uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
    const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
                                                         batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
    bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

    if (is_duplicate)
        fault_entry_duplicate_flags(current_entry, previous_entry);

    // Generate fault events for all fault packets
    uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
                                    NULL,
                                    gpu->id,
                                    UVM_ID_INVALID,
                                    current_entry,
                                    batch_context->batch_id,
                                    is_duplicate);

    // The VA isn't managed. See if ATS knows about it, unless it is a
    // duplicate and the previous fault was non-fatal, in which case the page
    // has already been serviced.
    //
    // TODO: Bug 2103669: Service more than one ATS fault at a time so we
    // don't do an unconditional VA range lookup for every ATS fault.
1470     if (!is_duplicate || previous_entry->is_fatal)
1471         status = uvm_ats_service_fault_entry(gpu_va_space, current_entry, ats_invalidate);
1472     else
1473         status = NV_OK;
1474
1475     (*block_faults)++;
1476
1477     update_batch_context(batch_context, current_entry, previous_entry);
1478
1479     return status;
1480 }
1481
1482 static void service_fault_batch_fatal(uvm_gpu_t *gpu,
1483                                       uvm_fault_service_batch_context_t *batch_context,
1484                                       NvU32 first_fault_index,
1485                                       NV_STATUS status,
1486                                       NvU32 *block_faults)
1487 {
1488     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1489     const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
1490                                                      batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
1491     bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1492
1493     if (is_duplicate)
1494         fault_entry_duplicate_flags(current_entry, previous_entry);
1495
1496     // The VA block could not be found. Set the fatal fault flag,
1497     // unless it is a prefetch fault
1498     if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
1499         current_entry->is_invalid_prefetch = true;
1500     }
1501     else {
1502         current_entry->is_fatal = true;
1503         current_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
1504         current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1505     }
1506
1507     update_batch_context(batch_context, current_entry, previous_entry);
1508
1509     uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
1510                                     NULL,
1511                                     gpu->id,
1512                                     UVM_ID_INVALID,
1513                                     current_entry,
1514                                     batch_context->batch_id,
1515                                     is_duplicate);
1516
1517     (*block_faults)++;
1518 }
1519
1520 static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
1521                                               uvm_gpu_va_space_t *gpu_va_space,
1522                                               uvm_fault_service_batch_context_t *batch_context,
1523                                               NvU32 first_fault_index,
1524                                               NvU32 *block_faults)
1525 {
1526     NV_STATUS status;
1527     uvm_va_range_t *va_range;
1528     uvm_va_block_t *va_block;
1529     uvm_gpu_t *gpu = gpu_va_space->gpu;
1530     uvm_va_block_context_t *va_block_context =
1531         &gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
1532     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1533     struct mm_struct *mm = va_block_context->mm;
1534     NvU64 fault_address = current_entry->fault_address;
1535
1536     (*block_faults) = 0;
1537
1538     va_range = uvm_va_range_find(va_space, fault_address);
1539     status = uvm_va_block_find_create_in_range(va_space, va_range, fault_address, va_block_context, &va_block);
1540     if (status == NV_OK) {
1541         status = service_fault_batch_block(gpu, va_block, batch_context, first_fault_index, block_faults);
1542     }
1543     else if ((status == NV_ERR_INVALID_ADDRESS) && uvm_ats_can_service_faults(gpu_va_space, mm)) {
1544         status = service_fault_batch_ats(gpu_va_space, mm, batch_context, first_fault_index, block_faults);
1545     }
1546     else {
1547         service_fault_batch_fatal(gpu_va_space->gpu, batch_context, first_fault_index, status, block_faults);
1548
1549         // Do not fail due to logical errors
1550         status = NV_OK;
1551     }
1552
1553     return status;
1554 }
1555
1556 // Scan the ordered view of faults, group them by va_block (managed faults),
1557 // and service each va_block's faults in batch.
1558 // Service non-managed faults one at a time as they are encountered during the
1559 // scan.
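//
// service_fault_batch_dispatch() consumes one or more entries of the ordered
// fault cache per call and returns the number consumed in block_faults, so
// the scan advances in variable-sized steps: a managed va_block may retire
// several faults at once, while ATS and fatal faults are retired one at a
// time.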
1560 // 1561 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer 1562 // was flushed because the needs_fault_buffer_flush flag was set on some GPU VA 1563 // space 1564 static NV_STATUS service_fault_batch(uvm_gpu_t *gpu, 1565 fault_service_mode_t service_mode, 1566 uvm_fault_service_batch_context_t *batch_context) 1567 { 1568 NV_STATUS status = NV_OK; 1569 NvU32 i; 1570 uvm_va_space_t *va_space = NULL; 1571 uvm_gpu_va_space_t *gpu_va_space = NULL; 1572 uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate; 1573 struct mm_struct *mm = NULL; 1574 const bool replay_per_va_block = service_mode != FAULT_SERVICE_MODE_CANCEL && 1575 gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK; 1576 uvm_va_block_context_t *va_block_context = 1577 &gpu->parent->fault_buffer_info.replayable.block_service_context.block_context; 1578 1579 UVM_ASSERT(gpu->parent->replayable_faults_supported); 1580 1581 ats_invalidate->write_faults_in_batch = false; 1582 1583 for (i = 0; i < batch_context->num_coalesced_faults;) { 1584 NvU32 block_faults; 1585 uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i]; 1586 uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id]; 1587 1588 UVM_ASSERT(current_entry->va_space); 1589 1590 if (current_entry->va_space != va_space) { 1591 // Fault on a different va_space, drop the lock of the old one... 1592 if (va_space != NULL) { 1593 // TLB entries are invalidated per GPU VA space 1594 status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker); 1595 if (status != NV_OK) 1596 goto fail; 1597 1598 uvm_va_space_up_read(va_space); 1599 uvm_va_space_mm_release_unlock(va_space, mm); 1600 mm = NULL; 1601 } 1602 1603 va_space = current_entry->va_space; 1604 1605 // ... and take the lock of the new one 1606 1607 // If an mm is registered with the VA space, we have to retain it 1608 // in order to lock it before locking the VA space. It is guaranteed 1609 // to remain valid until we release. If no mm is registered, we 1610 // can only service managed faults, not ATS/HMM faults. 1611 mm = uvm_va_space_mm_retain_lock(va_space); 1612 va_block_context->mm = mm; 1613 1614 uvm_va_space_down_read(va_space); 1615 1616 gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent); 1617 if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) { 1618 status = fault_buffer_flush_locked(gpu, 1619 UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT, 1620 UVM_FAULT_REPLAY_TYPE_START, 1621 batch_context); 1622 if (status == NV_OK) 1623 status = NV_WARN_MORE_PROCESSING_REQUIRED; 1624 1625 break; 1626 } 1627 1628 // The case where there is no valid GPU VA space for the GPU in this 1629 // VA space is handled next 1630 } 1631 1632 // Some faults could be already fatal if they cannot be handled by 1633 // the UVM driver 1634 if (current_entry->is_fatal) { 1635 ++i; 1636 batch_context->has_fatal_faults = true; 1637 utlb->has_fatal_faults = true; 1638 UVM_ASSERT(utlb->num_pending_faults > 0); 1639 continue; 1640 } 1641 1642 if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) { 1643 // If there is no GPU VA space for the GPU, ignore the fault. 
This 1644 // can happen if a GPU VA space is destroyed without explicitly 1645 // freeing all memory ranges (destroying the VA range triggers a 1646 // flush of the fault buffer) and there are stale entries in the 1647 // buffer that got fixed by the servicing in a previous batch. 1648 ++i; 1649 continue; 1650 } 1651 1652 status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults); 1653 // TODO: Bug 3900733: clean up locking in service_fault_batch(). 1654 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 1655 uvm_va_space_up_read(va_space); 1656 uvm_va_space_mm_release_unlock(va_space, mm); 1657 mm = NULL; 1658 va_space = NULL; 1659 continue; 1660 } 1661 if (status != NV_OK) 1662 goto fail; 1663 1664 i += block_faults; 1665 1666 // Don't issue replays in cancel mode 1667 if (replay_per_va_block && !batch_context->has_fatal_faults) { 1668 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 1669 if (status != NV_OK) 1670 goto fail; 1671 1672 // Increment the batch id if UVM_PERF_FAULT_REPLAY_POLICY_BLOCK 1673 // is used, as we issue a replay after servicing each VA block 1674 // and we can service a number of VA blocks before returning. 1675 ++batch_context->batch_id; 1676 } 1677 } 1678 1679 // Only clobber status if invalidate_status != NV_OK, since status may also 1680 // contain NV_WARN_MORE_PROCESSING_REQUIRED. 1681 if (va_space != NULL) { 1682 NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker); 1683 if (invalidate_status != NV_OK) 1684 status = invalidate_status; 1685 } 1686 1687 fail: 1688 if (va_space != NULL) { 1689 uvm_va_space_up_read(va_space); 1690 uvm_va_space_mm_release_unlock(va_space, mm); 1691 } 1692 1693 return status; 1694 } 1695 1696 // Tells if the given fault entry is the first one in its uTLB 1697 static bool is_first_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context, NvU32 fault_index) 1698 { 1699 NvU32 i; 1700 NvU32 utlb_id = batch_context->fault_cache[fault_index].fault_source.utlb_id; 1701 1702 for (i = 0; i < fault_index; ++i) { 1703 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 1704 1705 // We have found a prior fault in the same uTLB 1706 if (current_entry->fault_source.utlb_id == utlb_id) 1707 return false; 1708 } 1709 1710 return true; 1711 } 1712 1713 // Compute the number of fatal and non-fatal faults for a page in the given uTLB 1714 static void faults_for_page_in_utlb(uvm_fault_service_batch_context_t *batch_context, 1715 uvm_va_space_t *va_space, 1716 NvU64 addr, 1717 NvU32 utlb_id, 1718 NvU32 *fatal_faults, 1719 NvU32 *non_fatal_faults) 1720 { 1721 NvU32 i; 1722 1723 *fatal_faults = 0; 1724 *non_fatal_faults = 0; 1725 1726 // Fault filtering is not allowed in the TLB-based fault cancel path 1727 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 1728 1729 for (i = 0; i < batch_context->num_cached_faults; ++i) { 1730 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 1731 1732 if (current_entry->fault_source.utlb_id == utlb_id && 1733 current_entry->va_space == va_space && current_entry->fault_address == addr) { 1734 // We have found the page 1735 if (current_entry->is_fatal) 1736 ++(*fatal_faults); 1737 else 1738 ++(*non_fatal_faults); 1739 } 1740 } 1741 } 1742 1743 // Function that tells if there are addresses (reminder: they are aligned to 4K) 1744 // with non-fatal faults only 1745 static bool 
no_fatal_pages_in_utlb(uvm_fault_service_batch_context_t *batch_context, 1746 NvU32 start_index, 1747 NvU32 utlb_id) 1748 { 1749 NvU32 i; 1750 1751 // Fault filtering is not allowed in the TLB-based fault cancel path 1752 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 1753 1754 for (i = start_index; i < batch_context->num_cached_faults; ++i) { 1755 uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i]; 1756 1757 if (current_entry->fault_source.utlb_id == utlb_id) { 1758 // We have found a fault for the uTLB 1759 NvU32 fatal_faults; 1760 NvU32 non_fatal_faults; 1761 1762 faults_for_page_in_utlb(batch_context, 1763 current_entry->va_space, 1764 current_entry->fault_address, 1765 utlb_id, 1766 &fatal_faults, 1767 &non_fatal_faults); 1768 1769 if (non_fatal_faults > 0 && fatal_faults == 0) 1770 return true; 1771 } 1772 } 1773 1774 return false; 1775 } 1776 1777 static void record_fatal_fault_helper(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *entry, UvmEventFatalReason reason) 1778 { 1779 uvm_va_space_t *va_space; 1780 1781 va_space = entry->va_space; 1782 UVM_ASSERT(va_space); 1783 uvm_va_space_down_read(va_space); 1784 // Record fatal fault event 1785 uvm_tools_record_gpu_fatal_fault(gpu->parent->id, va_space, entry, reason); 1786 uvm_va_space_up_read(va_space); 1787 } 1788 1789 // This function tries to find and issue a cancel for each uTLB that meets 1790 // the requirements to guarantee precise fault attribution: 1791 // - No new faults can arrive on the uTLB (uTLB is in lockdown) 1792 // - The first fault in the buffer for a specific uTLB is fatal 1793 // - There are no other addresses in the uTLB with non-fatal faults only 1794 // 1795 // This function and the related helpers iterate over faults as read from HW, 1796 // not through the ordered fault view 1797 // 1798 // TODO: Bug 1766754 1799 // This is very costly, although not critical for performance since we are 1800 // cancelling. 
1801 // - Build a list with all the faults within a uTLB
1802 // - Sort by uTLB id
1803 static NV_STATUS try_to_cancel_utlbs(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
1804 {
1805     NvU32 i;
1806
1807     // Fault filtering is not allowed in the TLB-based fault cancel path
1808     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1809
1810     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1811         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1812         uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
1813         NvU32 gpc_id = current_entry->fault_source.gpc_id;
1814         NvU32 utlb_id = current_entry->fault_source.utlb_id;
1815         NvU32 client_id = current_entry->fault_source.client_id;
1816
1817         // Only fatal faults are considered
1818         if (!current_entry->is_fatal)
1819             continue;
1820
1821         // Only consider uTLBs in lock-down
1822         if (!utlb->in_lockdown)
1823             continue;
1824
1825         // Issue a single cancel per uTLB
1826         if (utlb->cancelled)
1827             continue;
1828
1829         if (is_first_fault_in_utlb(batch_context, i) &&
1830             !no_fatal_pages_in_utlb(batch_context, i + 1, utlb_id)) {
1831             NV_STATUS status;
1832
1833             record_fatal_fault_helper(gpu, current_entry, current_entry->fatal_reason);
1834
1835             status = push_cancel_on_gpu_targeted(gpu,
1836                                                  current_entry->instance_ptr,
1837                                                  gpc_id,
1838                                                  client_id,
1839                                                  &batch_context->tracker);
1840             if (status != NV_OK)
1841                 return status;
1842
1843             utlb->cancelled = true;
1844         }
1845     }
1846
1847     return NV_OK;
1848 }
1849
1850 static NvU32 find_fatal_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context,
1851                                       NvU32 utlb_id)
1852 {
1853     NvU32 i;
1854
1855     // Fault filtering is not allowed in the TLB-based fault cancel path
1856     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1857
1858     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1859         if (batch_context->fault_cache[i].is_fatal &&
1860             batch_context->fault_cache[i].fault_source.utlb_id == utlb_id)
1861             return i;
1862     }
1863
1864     return i;
1865 }
1866
1867 static bool is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_context,
1868                                      uvm_fault_buffer_entry_t *fault)
1869 {
1870     NvU32 i;
1871
1872     // Fault filtering is not allowed in the TLB-based fault cancel path
1873     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1874
1875     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1876         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1877         if (cmp_fault_instance_ptr(current_entry, fault) == 0 &&
1878             current_entry->fault_address == fault->fault_address &&
1879             current_entry->fault_access_type == fault->fault_access_type &&
1880             current_entry->fault_source.utlb_id == fault->fault_source.utlb_id) {
1881             return true;
1882         }
1883     }
1884
1885     return false;
1886 }
1887
1888 typedef enum
1889 {
1890     // Only cancel faults flagged as fatal
1891     FAULT_CANCEL_MODE_FATAL,
1892
1893     // Cancel all faults in the batch unconditionally
1894     FAULT_CANCEL_MODE_ALL,
1895 } fault_cancel_mode_t;
1896
1897 // Cancel faults in the given fault service batch context. The function provides
1898 // two different modes depending on the value of cancel_mode:
1899 // - If cancel_mode == FAULT_CANCEL_MODE_FATAL, only faults flagged as fatal
1900 //   will be cancelled. In this case, the reason reported to tools is the one
1901 //   contained in the fault entry itself.
1902 // - If cancel_mode == FAULT_CANCEL_MODE_ALL, all faults will be cancelled
1903 //   unconditionally. In this case, the reason reported to tools for non-fatal
1904 //   faults is the one passed to this function.
1905 static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
1906                                           uvm_fault_service_batch_context_t *batch_context,
1907                                           fault_cancel_mode_t cancel_mode,
1908                                           UvmEventFatalReason reason)
1909 {
1910     NV_STATUS status = NV_OK;
1911     NV_STATUS fault_status;
1912     uvm_va_space_t *va_space = NULL;
1913     NvU32 i;
1914
1915     UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
1916     if (cancel_mode == FAULT_CANCEL_MODE_ALL)
1917         UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
1918
1919     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
1920         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1921
1922         UVM_ASSERT(current_entry->va_space);
1923
1924         if (current_entry->va_space != va_space) {
1925             // Fault on a different va_space, drop the lock of the old one...
1926             if (va_space != NULL)
1927                 uvm_va_space_up_read(va_space);
1928
1929             va_space = current_entry->va_space;
1930
1931             // ... and take the lock of the new one
1932             uvm_va_space_down_read(va_space);
1933
1934             // We don't need to check whether a buffer flush is required
1935             // (due to VA range destruction).
1936             // - For cancel_mode == FAULT_CANCEL_MODE_FATAL, once a fault is
1937             //   flagged as fatal we need to cancel it, even if its VA range no
1938             //   longer exists.
1939             // - For cancel_mode == FAULT_CANCEL_MODE_ALL we don't care about
1940             //   any of this, we just want to trigger RC in RM.
1941         }
1942
1943         if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
1944             // If there is no GPU VA space for the GPU, ignore the fault.
1945             // This can happen if the GPU VA space did not exist in
1946             // service_fault_batch(), or it was destroyed since then.
1947             // This is to avoid targeting a PDB that might have been reused
1948             // by another process.
1949             continue;
1950         }
1951
1952         // Cancel the fault
1953         if (cancel_mode == FAULT_CANCEL_MODE_ALL || current_entry->is_fatal) {
1954             uvm_fault_cancel_va_mode_t cancel_va_mode = current_entry->replayable.cancel_va_mode;
1955
1956             // If cancelling unconditionally and the fault was not fatal,
1957             // set the cancel reason passed to this function
1958             if (!current_entry->is_fatal) {
1959                 current_entry->fatal_reason = reason;
1960                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1961             }
1962
1963             status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
1964             if (status != NV_OK)
1965                 break;
1966         }
1967     }
1968
1969     if (va_space != NULL)
1970         uvm_va_space_up_read(va_space);
1971
1972     // After cancelling the fatal faults, the fault buffer is flushed to remove
1973     // any potential duplicated fault that may have been added while processing
1974     // the faults in this batch. The flush also avoids unnecessary processing:
1975     // after the fatal faults have been cancelled, the remaining faults are
1976     // unlikely to show up again after a replay because the context is probably
1977     // in the process of dying.
1978     fault_status = fault_buffer_flush_locked(gpu,
1979                                              UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
1980                                              UVM_FAULT_REPLAY_TYPE_START,
1981                                              batch_context);
1982
1983     // We report the first encountered error.
1984     if (status == NV_OK)
1985         status = fault_status;
1986
1987     return status;
1988 }
1989
1990 // Function called when the system has found a global error and needs to
1991 // trigger RC in RM.
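// Unlike cancel_faults_precise_va(), this path cannot cancel an individual
// faulting address: it pushes a global cancel on the instance pointer of
// every coalesced fault entry in the batch, so the cancellation granularity
// is the faulting context rather than the fault itself.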
1992 static void cancel_fault_batch_tlb(uvm_gpu_t *gpu,
1993                                    uvm_fault_service_batch_context_t *batch_context,
1994                                    UvmEventFatalReason reason)
1995 {
1996     NvU32 i;
1997
1998     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
1999         NV_STATUS status = NV_OK;
2000         uvm_fault_buffer_entry_t *current_entry;
2001         uvm_fault_buffer_entry_t *coalesced_entry;
2002
2003         current_entry = batch_context->ordered_fault_cache[i];
2004
2005         // The list iteration below skips the entry used as 'head'.
2006         // Report the 'head' entry explicitly.
2007         uvm_va_space_down_read(current_entry->va_space);
2008         uvm_tools_record_gpu_fatal_fault(gpu->parent->id, current_entry->va_space, current_entry, reason);
2009
2010         list_for_each_entry(coalesced_entry, &current_entry->merged_instances_list, merged_instances_list)
2011             uvm_tools_record_gpu_fatal_fault(gpu->parent->id, current_entry->va_space, coalesced_entry, reason);
2012         uvm_va_space_up_read(current_entry->va_space);
2013
2014         // We need to cancel each instance pointer to correctly handle faults from multiple contexts.
2015         status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
2016         if (status != NV_OK)
2017             break;
2018     }
2019 }
2020
2021 static void cancel_fault_batch(uvm_gpu_t *gpu,
2022                                uvm_fault_service_batch_context_t *batch_context,
2023                                UvmEventFatalReason reason)
2024 {
2025     if (gpu->parent->fault_cancel_va_supported) {
2026         cancel_faults_precise_va(gpu, batch_context, FAULT_CANCEL_MODE_ALL, reason);
2027         return;
2028     }
2029
2030     cancel_fault_batch_tlb(gpu, batch_context, reason);
2031 }
2032
2033
2034 // Current fault cancel algorithm
2035 //
2036 // 1- Disable prefetching to prevent new prefetch requests from flooding the
2037 //    buffer.
2038 // LOOP
2039 //   2- Record one fatal fault per uTLB to check if it shows up after the replay
2040 //   3- Flush fault buffer (REPLAY_TYPE_START_ACK_ALL to prevent new faults from
2041 //      coming to TLBs with pending faults)
2042 //   4- Wait for replay to finish
2043 //   5- Fetch all faults from buffer
2044 //   6- Check what uTLBs are in lockdown mode and can be cancelled
2045 //   7- Preprocess faults (order per va_space, fault address, access type)
2046 //   8- Service all non-fatal faults and mark all non-serviceable faults as fatal
2047 //      8.1- If no fatal faults are found, we are done
2048 //   9- Search for a uTLB which can be targeted for cancel, as described in
2049 //      try_to_cancel_utlbs. If found, cancel it.
2050 // END LOOP
2051 // 10- Re-enable prefetching
2052 //
2053 // NOTE: prefetch faults MUST NOT trigger fault cancel. We make sure that no
2054 // prefetch faults are left in the buffer by disabling prefetching and
2055 // flushing the fault buffer afterwards (prefetch faults are not replayed and,
2056 // therefore, will not show up again)
2057 static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2058 {
2059     NV_STATUS status;
2060     NV_STATUS tracker_status;
2061     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
2062     bool first = true;
2063
2064     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2065
2066     // 1) Disable prefetching to prevent new prefetch requests from flooding
2067     //    the buffer
2068     if (gpu->parent->fault_buffer_info.prefetch_faults_enabled)
2069         gpu->parent->arch_hal->disable_prefetch_faults(gpu->parent);
2070
2071     while (1) {
2072         NvU32 utlb_id;
2073
2074         // 2) Record one fatal fault per uTLB to check if it shows up after
2075         //    the replay.
This is used to handle the case in which the uTLB is 2076 // being cancelled from behind our backs by RM. See the comment in 2077 // step 6. 2078 for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) { 2079 uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id]; 2080 2081 if (!first && utlb->has_fatal_faults) { 2082 NvU32 idx = find_fatal_fault_in_utlb(batch_context, utlb_id); 2083 UVM_ASSERT(idx < batch_context->num_cached_faults); 2084 2085 utlb->prev_fatal_fault = batch_context->fault_cache[idx]; 2086 } 2087 else { 2088 utlb->prev_fatal_fault.fault_address = (NvU64)-1; 2089 } 2090 } 2091 first = false; 2092 2093 // 3) Flush fault buffer. After this call, all faults from any of the 2094 // faulting uTLBs are before PUT. New faults from other uTLBs can keep 2095 // arriving. Therefore, in each iteration we just try to cancel faults 2096 // from uTLBs that contained fatal faults in the previous iterations 2097 // and will cause the TLB to stop generating new page faults after the 2098 // following replay with type UVM_FAULT_REPLAY_TYPE_START_ACK_ALL 2099 status = fault_buffer_flush_locked(gpu, 2100 UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT, 2101 UVM_FAULT_REPLAY_TYPE_START_ACK_ALL, 2102 batch_context); 2103 if (status != NV_OK) 2104 break; 2105 2106 // 4) Wait for replay to finish 2107 status = uvm_tracker_wait(&replayable_faults->replay_tracker); 2108 if (status != NV_OK) 2109 break; 2110 2111 batch_context->num_invalid_prefetch_faults = 0; 2112 batch_context->num_replays = 0; 2113 batch_context->has_fatal_faults = false; 2114 batch_context->has_throttled_faults = false; 2115 2116 // 5) Fetch all faults from buffer 2117 fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL); 2118 ++batch_context->batch_id; 2119 2120 UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults); 2121 2122 // No more faults left, we are done 2123 if (batch_context->num_cached_faults == 0) 2124 break; 2125 2126 // 6) Check what uTLBs are in lockdown mode and can be cancelled 2127 for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) { 2128 uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id]; 2129 2130 utlb->in_lockdown = false; 2131 utlb->cancelled = false; 2132 2133 if (utlb->prev_fatal_fault.fault_address != (NvU64)-1) { 2134 // If a previously-reported fault shows up again we can "safely" 2135 // assume that the uTLB that contains it is in lockdown mode 2136 // and no new translations will show up before cancel. 2137 // A fatal fault could only be removed behind our backs by RM 2138 // issuing a cancel, which only happens when RM is resetting the 2139 // engine. That means the instance pointer can't generate any 2140 // new faults, so we won't have an ABA problem where a new 2141 // fault arrives with the same state. 
2142 if (is_fatal_fault_in_buffer(batch_context, &utlb->prev_fatal_fault)) 2143 utlb->in_lockdown = true; 2144 } 2145 } 2146 2147 // 7) Preprocess faults 2148 status = preprocess_fault_batch(gpu, batch_context); 2149 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2150 continue; 2151 else if (status != NV_OK) 2152 break; 2153 2154 // 8) Service all non-fatal faults and mark all non-serviceable faults 2155 // as fatal 2156 status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context); 2157 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2158 continue; 2159 2160 UVM_ASSERT(batch_context->num_replays == 0); 2161 if (status == NV_ERR_NO_MEMORY) 2162 continue; 2163 else if (status != NV_OK) 2164 break; 2165 2166 // No more fatal faults left, we are done 2167 if (!batch_context->has_fatal_faults) 2168 break; 2169 2170 // 9) Search for uTLBs that contain fatal faults and meet the 2171 // requirements to be cancelled 2172 try_to_cancel_utlbs(gpu, batch_context); 2173 } 2174 2175 // 10) Re-enable prefetching 2176 if (gpu->parent->fault_buffer_info.prefetch_faults_enabled) 2177 gpu->parent->arch_hal->enable_prefetch_faults(gpu->parent); 2178 2179 if (status == NV_OK) 2180 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2181 2182 tracker_status = uvm_tracker_wait(&batch_context->tracker); 2183 2184 return status == NV_OK? tracker_status: status; 2185 } 2186 2187 static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context) 2188 { 2189 UVM_ASSERT(batch_context->has_fatal_faults); 2190 if (gpu->parent->fault_cancel_va_supported) { 2191 return cancel_faults_precise_va(gpu, 2192 batch_context, 2193 FAULT_CANCEL_MODE_FATAL, 2194 UvmEventFatalReasonInvalid); 2195 } 2196 2197 return cancel_faults_precise_tlb(gpu, batch_context); 2198 } 2199 2200 static void enable_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu, uvm_fault_service_batch_context_t *batch_context) 2201 { 2202 if (!parent_gpu->prefetch_fault_supported) 2203 return; 2204 2205 // If more than 66% of faults are invalid prefetch accesses, disable 2206 // prefetch faults for a while. 2207 // Some tests rely on this logic (and ratio) to correctly disable prefetch 2208 // fault reporting. If the logic changes, the tests will have to be changed. 
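    // The "more than 66%" check below is done in integer arithmetic:
    //
    //   num_invalid_prefetch_faults * 3 > max_batch_size * 2
    //
    // is equivalent to num_invalid_prefetch_faults / max_batch_size > 2/3.
    // For example, with a max batch size of 256 entries, prefetch faults are
    // disabled once a batch contains at least 171 invalid prefetch accesses.
    // On simulated GPUs with built-in tests enabled, a small absolute
    // threshold (more than 5 invalid prefetch faults) is used instead.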
2209 if (parent_gpu->fault_buffer_info.prefetch_faults_enabled && 2210 uvm_perf_reenable_prefetch_faults_lapse_msec > 0 && 2211 ((batch_context->num_invalid_prefetch_faults * 3 > parent_gpu->fault_buffer_info.max_batch_size * 2) || 2212 (uvm_enable_builtin_tests && 2213 parent_gpu->rm_info.isSimulated && 2214 batch_context->num_invalid_prefetch_faults > 5))) { 2215 uvm_gpu_disable_prefetch_faults(parent_gpu); 2216 } 2217 else if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) { 2218 NvU64 lapse = NV_GETTIME() - parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp; 2219 2220 // Reenable prefetch faults after some time 2221 if (lapse > ((NvU64)uvm_perf_reenable_prefetch_faults_lapse_msec * (1000 * 1000))) 2222 uvm_gpu_enable_prefetch_faults(parent_gpu); 2223 } 2224 } 2225 2226 void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu) 2227 { 2228 NvU32 num_replays = 0; 2229 NvU32 num_batches = 0; 2230 NvU32 num_throttled = 0; 2231 NV_STATUS status = NV_OK; 2232 uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; 2233 uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context; 2234 2235 UVM_ASSERT(gpu->parent->replayable_faults_supported); 2236 2237 uvm_tracker_init(&batch_context->tracker); 2238 2239 // Process all faults in the buffer 2240 while (1) { 2241 if (num_throttled >= uvm_perf_fault_max_throttle_per_service || 2242 num_batches >= uvm_perf_fault_max_batches_per_service) { 2243 break; 2244 } 2245 2246 batch_context->num_invalid_prefetch_faults = 0; 2247 batch_context->num_duplicate_faults = 0; 2248 batch_context->num_replays = 0; 2249 batch_context->has_fatal_faults = false; 2250 batch_context->has_throttled_faults = false; 2251 2252 fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY); 2253 if (batch_context->num_cached_faults == 0) 2254 break; 2255 2256 ++batch_context->batch_id; 2257 2258 status = preprocess_fault_batch(gpu, batch_context); 2259 2260 num_replays += batch_context->num_replays; 2261 2262 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2263 continue; 2264 else if (status != NV_OK) 2265 break; 2266 2267 status = service_fault_batch(gpu, FAULT_SERVICE_MODE_REGULAR, batch_context); 2268 2269 // We may have issued replays even if status != NV_OK if 2270 // UVM_PERF_FAULT_REPLAY_POLICY_BLOCK is being used or the fault buffer 2271 // was flushed 2272 num_replays += batch_context->num_replays; 2273 2274 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2275 continue; 2276 2277 enable_disable_prefetch_faults(gpu->parent, batch_context); 2278 2279 if (status != NV_OK) { 2280 // Unconditionally cancel all faults to trigger RC. This will not 2281 // provide precise attribution, but this case handles global 2282 // errors such as OOM or ECC where it's not reasonable to 2283 // guarantee precise attribution. We ignore the return value of 2284 // the cancel operation since this path is already returning an 2285 // error code. 
2286 cancel_fault_batch(gpu, batch_context, uvm_tools_status_to_fatal_fault_reason(status)); 2287 break; 2288 } 2289 2290 if (batch_context->has_fatal_faults) { 2291 status = uvm_tracker_wait(&batch_context->tracker); 2292 if (status == NV_OK) 2293 status = cancel_faults_precise(gpu, batch_context); 2294 2295 break; 2296 } 2297 2298 if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH) { 2299 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2300 if (status != NV_OK) 2301 break; 2302 ++num_replays; 2303 } 2304 else if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH) { 2305 uvm_gpu_buffer_flush_mode_t flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT; 2306 2307 if (batch_context->num_duplicate_faults * 100 > 2308 batch_context->num_cached_faults * replayable_faults->replay_update_put_ratio) { 2309 flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT; 2310 } 2311 2312 status = fault_buffer_flush_locked(gpu, flush_mode, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2313 if (status != NV_OK) 2314 break; 2315 ++num_replays; 2316 status = uvm_tracker_wait(&replayable_faults->replay_tracker); 2317 if (status != NV_OK) 2318 break; 2319 } 2320 2321 if (batch_context->has_throttled_faults) 2322 ++num_throttled; 2323 2324 ++num_batches; 2325 } 2326 2327 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) 2328 status = NV_OK; 2329 2330 // Make sure that we issue at least one replay if no replay has been 2331 // issued yet to avoid dropping faults that do not show up in the buffer 2332 if ((status == NV_OK && replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_ONCE) || 2333 num_replays == 0) 2334 status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context); 2335 2336 uvm_tracker_deinit(&batch_context->tracker); 2337 2338 if (status != NV_OK) 2339 UVM_DBG_PRINT("Error servicing replayable faults on GPU: %s\n", uvm_gpu_name(gpu)); 2340 } 2341 2342 void uvm_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu) 2343 { 2344 UVM_ASSERT(parent_gpu->isr.replayable_faults.handling); 2345 UVM_ASSERT(parent_gpu->prefetch_fault_supported); 2346 2347 if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) { 2348 parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu); 2349 parent_gpu->fault_buffer_info.prefetch_faults_enabled = true; 2350 } 2351 } 2352 2353 void uvm_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu) 2354 { 2355 UVM_ASSERT(parent_gpu->isr.replayable_faults.handling); 2356 UVM_ASSERT(parent_gpu->prefetch_fault_supported); 2357 2358 if (parent_gpu->fault_buffer_info.prefetch_faults_enabled) { 2359 parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu); 2360 parent_gpu->fault_buffer_info.prefetch_faults_enabled = false; 2361 parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp = NV_GETTIME(); 2362 } 2363 } 2364 2365 const char *uvm_perf_fault_replay_policy_string(uvm_perf_fault_replay_policy_t replay_policy) 2366 { 2367 BUILD_BUG_ON(UVM_PERF_FAULT_REPLAY_POLICY_MAX != 4); 2368 2369 switch (replay_policy) { 2370 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BLOCK); 2371 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH); 2372 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH); 2373 UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_ONCE); 2374 UVM_ENUM_STRING_DEFAULT(); 2375 } 2376 } 2377 2378 NV_STATUS uvm_test_get_prefetch_faults_reenable_lapse(UVM_TEST_GET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params, 2379 struct file *filp) 2380 { 2381 
params->reenable_lapse = uvm_perf_reenable_prefetch_faults_lapse_msec; 2382 2383 return NV_OK; 2384 } 2385 2386 NV_STATUS uvm_test_set_prefetch_faults_reenable_lapse(UVM_TEST_SET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params, 2387 struct file *filp) 2388 { 2389 uvm_perf_reenable_prefetch_faults_lapse_msec = params->reenable_lapse; 2390 2391 return NV_OK; 2392 } 2393 2394 NV_STATUS uvm_test_drain_replayable_faults(UVM_TEST_DRAIN_REPLAYABLE_FAULTS_PARAMS *params, struct file *filp) 2395 { 2396 uvm_gpu_t *gpu; 2397 NV_STATUS status = NV_OK; 2398 uvm_spin_loop_t spin; 2399 bool pending = true; 2400 uvm_va_space_t *va_space = uvm_va_space_get(filp); 2401 2402 gpu = uvm_va_space_retain_gpu_by_uuid(va_space, ¶ms->gpu_uuid); 2403 if (!gpu) 2404 return NV_ERR_INVALID_DEVICE; 2405 2406 uvm_spin_loop_init(&spin); 2407 2408 do { 2409 uvm_gpu_replayable_faults_isr_lock(gpu->parent); 2410 pending = uvm_gpu_replayable_faults_pending(gpu->parent); 2411 uvm_gpu_replayable_faults_isr_unlock(gpu->parent); 2412 2413 if (!pending) 2414 break; 2415 2416 if (fatal_signal_pending(current)) { 2417 status = NV_ERR_SIGNAL_PENDING; 2418 break; 2419 } 2420 2421 UVM_SPIN_LOOP(&spin); 2422 } while (uvm_spin_loop_elapsed(&spin) < params->timeout_ns); 2423 2424 if (pending && status == NV_OK) 2425 status = NV_ERR_TIMEOUT; 2426 2427 uvm_gpu_release(gpu); 2428 2429 return status; 2430 } 2431