/*******************************************************************************
    Copyright (c) 2017-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*******************************************************************************/

#include "nv_uvm_interface.h"
#include "uvm_common.h"
#include "uvm_api.h"
#include "uvm_gpu_non_replayable_faults.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_lock.h"
#include "uvm_tools.h"
#include "uvm_user_channel.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_kvmalloc.h"
#include "uvm_ats_faults.h"

// In the context of a CUDA application using Unified Memory, it is sometimes
// assumed that there is a single type of fault, originated by a memory
// load/store in an SM (Graphics Engine), which itself can be traced back to a
// memory access in a CUDA kernel written by a developer. In reality, faults
// can also be triggered by other parts of the GPU, i.e. by other engines, as
// the result of developer-facing APIs or of operations initiated by a
// user-mode driver. The Graphics Engine faults are called replayable faults,
// while the rest are called non-replayable. The differences between the two
// types of faults go well beyond the engine originating the fault.
//
// A non-replayable fault originates in an engine other than Graphics. UVM
// services non-replayable faults from the Copy and PBDMA (Host/ESCHED)
// Engines. Non-replayable faults originated in other engines are considered
// fatal, and do not reach the UVM driver. While UVM can distinguish between
// faults originated in the Copy Engine and faults originated in the PBDMA
// Engine, in practice they are all processed in the same way. Replayable
// fault support in Graphics was introduced in Pascal, and non-replayable
// fault support in the CE and PBDMA Engines was introduced in Volta; before
// Volta, all non-replayable faults were fatal.
//
// An example of a Copy Engine non-replayable fault is a memory copy between
// two virtual addresses on a GPU, in which either the source or the
// destination pointer is not currently mapped to a physical address in the
// page tables of the GPU. An example of a PBDMA non-replayable fault is a
// semaphore acquire in which the semaphore virtual address passed as an
// argument is not currently mapped to any physical address.
//
// Non-replayable faults originated in the CE and PBDMA Engines result in HW
// preempting the channel associated with the fault, a mechanism called "fault
// and switch". More precisely, the switch-out affects not only the channel
// that caused the fault, but all the channels in the same Time Slice Group
// (TSG). SW intervention is required before the channels in that TSG can be
// scheduled again, but channels in other TSGs can be scheduled and resume
// their normal execution. In the case of the non-replayable faults serviced
// by UVM, the driver clears a channel's faulted bit upon successful
// servicing, but it is only when servicing has completed for all the channels
// in the TSG that they are all allowed to be switched in again.
// Non-replayable faults originated in engines other than CE and PBDMA are
// fatal because those units lack hardware support for the "fault and switch"
// and restart mechanisms just described. Replayable faults, on the other
// hand, block preemption of the faulting channel until software (UVM)
// services the fault; this is sometimes known as "fault and stall". Note that
// replayable faults prevent the execution of other channels, which remain
// stalled until the fault is serviced.
//
// The "non-replayable" naming alludes to the fact that, historically, these
// faults indicated a fatal condition, so there was no recovery ("replay")
// process and SW could not ignore or drop the fault. As discussed above, this
// is no longer the case, and while the hardware documentation at times uses
// the "fault and replay" expression for CE and PBDMA faults, we reserve that
// expression for Graphics faults and favor the term "fault and reschedule"
// instead. Replaying a fault does not necessarily imply that UVM has serviced
// it. For example, the UVM driver may choose to ignore the replayable faults
// associated with a GPU for some period of time if it detects thrashing and
// decides that the GPU needs to be throttled. The fault entries corresponding
// to the ignored faults are never saved by UVM, but hardware will generate
// new entries (and new interrupts) each time UVM issues a replay.
//
// While replayable faults are always the responsibility of UVM, the servicing
// of non-replayable faults is split between RM and UVM. In the case of
// replayable faults, UVM has sole SW ownership of the hardware buffer
// containing the faults: it is responsible for updating the GET pointer to
// signal the hardware that a number of faults have been read, and it reads
// the PUT pointer value written by hardware. In the case of non-replayable
// faults, however, UVM reads the fault entries out of a regular CPU buffer
// shared with RM, called the "shadow buffer". RM is responsible for accessing
// the actual non-replayable hardware buffer, reading the PUT pointer,
// updating the GET pointer, and moving CE and PBDMA faults from the hardware
// buffer to the shadow buffer. Because the Resource Manager owns the HW
// buffer, UVM needs to call RM when servicing a non-replayable fault: first
// to find out whether there is a pending fault, and then to read entries from
// the shadow buffer.
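//
// Putting the pieces together, the servicing path in this file follows,
// roughly, the simplified outline below; the real code adds locking, push
// tracking and error handling (see uvm_gpu_service_non_replayable_fault_buffer
// and service_fault further down):
//
//   while fetch_non_replayable_fault_buffer_entries() copies entries from the
//   RM shadow buffer into the fault cache:
//       for each cached entry:
//           translate it to a VA space and user channel, service the fault
//           (service_fault), and, on success, clear the channel's faulted bit
//           so the channel can be rescheduled (clear_faulted_on_gpu).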
//
// Once UVM has parsed a non-replayable fault entry corresponding to managed
// memory, and identified the VA block associated with it, the servicing logic
// for that block is identical to that of a replayable fault, see
// uvm_va_block_service_locked. Another similarity between the two types of
// faults is that they use the same entry format, uvm_fault_buffer_entry_t.


// There is no error handling in this function. The caller is in charge of
// calling uvm_gpu_fault_buffer_deinit_non_replayable_faults on failure.
NV_STATUS uvm_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;

    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);

    non_replayable_faults->shadow_buffer_copy = NULL;
    non_replayable_faults->fault_cache = NULL;

    non_replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize /
                                        parent_gpu->fault_buffer_hal->entry_size(parent_gpu);

    non_replayable_faults->shadow_buffer_copy =
        uvm_kvmalloc_zero(parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize);
    if (!non_replayable_faults->shadow_buffer_copy)
        return NV_ERR_NO_MEMORY;

    non_replayable_faults->fault_cache = uvm_kvmalloc_zero(non_replayable_faults->max_faults *
                                                           sizeof(*non_replayable_faults->fault_cache));
    if (!non_replayable_faults->fault_cache)
        return NV_ERR_NO_MEMORY;

    uvm_tracker_init(&non_replayable_faults->clear_faulted_tracker);
    uvm_tracker_init(&non_replayable_faults->fault_service_tracker);

    return NV_OK;
}

void uvm_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;

    if (non_replayable_faults->fault_cache) {
        UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->clear_faulted_tracker));
        uvm_tracker_deinit(&non_replayable_faults->clear_faulted_tracker);

        UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->fault_service_tracker));
        uvm_tracker_deinit(&non_replayable_faults->fault_service_tracker);
    }

    uvm_kvfree(non_replayable_faults->shadow_buffer_copy);
    uvm_kvfree(non_replayable_faults->fault_cache);
    non_replayable_faults->shadow_buffer_copy = NULL;
    non_replayable_faults->fault_cache = NULL;
}

bool uvm_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status;
    NvBool has_pending_faults;

    UVM_ASSERT(parent_gpu->isr.non_replayable_faults.handling);

    status = nvUvmInterfaceHasPendingNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
                                                         &has_pending_faults);
    UVM_ASSERT(status == NV_OK);

    return has_pending_faults == NV_TRUE;
}

static NvU32 fetch_non_replayable_fault_buffer_entries(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    NvU32 i = 0;
    NvU32 cached_faults = 0;
    uvm_fault_buffer_entry_t *fault_cache;
    NvU32 entry_size = gpu->parent->fault_buffer_hal->entry_size(gpu->parent);
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    char *current_hw_entry = (char *)non_replayable_faults->shadow_buffer_copy;

    fault_cache = non_replayable_faults->fault_cache;

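    // The shadow buffer copy and the fault cache are per-parent-GPU staging
    // buffers shared by all servicing of this GPU's non-replayable faults, so
    // they may only be filled and parsed while holding the ISR service lock,
    // as asserted below.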
    UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.non_replayable_faults.service_lock));
    UVM_ASSERT(gpu->parent->non_replayable_faults_supported);

    status = nvUvmInterfaceGetNonReplayableFaults(&gpu->parent->fault_buffer_info.rm_info,
                                                  non_replayable_faults->shadow_buffer_copy,
                                                  &cached_faults);
    UVM_ASSERT(status == NV_OK);

    // Parse all faults
    for (i = 0; i < cached_faults; ++i) {
        uvm_fault_buffer_entry_t *fault_entry = &fault_cache[i];

        gpu->parent->fault_buffer_hal->parse_non_replayable_entry(gpu->parent, current_hw_entry, fault_entry);

        // The GPU aligns the fault addresses to 4K, but all of our tracking
        // is done in PAGE_SIZE chunks, which might be larger.
        fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address);

        // Make sure that all fields in the entry are properly initialized
        fault_entry->va_space = NULL;
        fault_entry->is_fatal = (fault_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
        fault_entry->filtered = false;

        fault_entry->num_instances = 1;
        fault_entry->access_type_mask = uvm_fault_access_type_mask_bit(fault_entry->fault_access_type);
        INIT_LIST_HEAD(&fault_entry->merged_instances_list);
        fault_entry->non_replayable.buffer_index = i;

        if (fault_entry->is_fatal) {
            // Record the fatal fault event later, as we need the va_space locked
            fault_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
        }
        else {
            fault_entry->fatal_reason = UvmEventFatalReasonInvalid;
        }

        current_hw_entry += entry_size;
    }

    return cached_faults;
}

// In SRIOV, the UVM (guest) driver does not have access to the privileged
// registers used to clear the faulted bit. Instead, UVM requests the host RM
// to do the clearing on its behalf, using a SW method.
static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
{
    if (uvm_gpu_is_virt_mode_sriov(gpu)) {
        UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
        return true;
    }

    return false;
}

static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
                                             uvm_user_channel_t *user_channel,
                                             const uvm_fault_buffer_entry_t *fault_entry,
                                             NvU32 batch_id,
                                             uvm_tracker_t *tracker)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;

    UVM_ASSERT(!fault_entry->is_fatal);

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    tracker,
                                    &push,
                                    "Clearing faulted bit for address 0x%llx",
                                    fault_entry->fault_address);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Error acquiring tracker before clearing faulted: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    if (use_clear_faulted_channel_sw_method(gpu))
        gpu->parent->host_hal->clear_faulted_channel_sw_method(&push, user_channel, fault_entry);
    else
        gpu->parent->host_hal->clear_faulted_channel_method(&push, user_channel, fault_entry);

    uvm_tools_broadcast_replay(gpu, &push, batch_id, fault_entry->fault_source.client_type);

    uvm_push_end(&push);

    // Add this push to the GPU's clear_faulted_tracker so GPU removal can
    // wait on it.
    status = uvm_tracker_add_push_safe(&non_replayable_faults->clear_faulted_tracker, &push);

    // Add this push to the channel's clear_faulted_tracker so user channel
    // removal can wait on it instead of using the per-GPU tracker, which
    // would require a lock.
    if (status == NV_OK)
        status = uvm_tracker_add_push_safe(&user_channel->clear_faulted_tracker, &push);

    return status;
}

static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
                                               uvm_user_channel_t *user_channel,
                                               const uvm_fault_buffer_entry_t *fault_entry,
                                               NvU32 batch_id,
                                               uvm_tracker_t *tracker)
{
    NV_STATUS status;

    UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);

    // We need to wait for all pending work before writing to the channel
    // register
    status = uvm_tracker_wait(tracker);
    if (status != NV_OK)
        return status;

    gpu->parent->host_hal->clear_faulted_channel_register(user_channel, fault_entry);

    uvm_tools_broadcast_replay_sync(gpu, batch_id, fault_entry->fault_source.client_type);

    return NV_OK;
}

static NV_STATUS clear_faulted_on_gpu(uvm_gpu_t *gpu,
                                      uvm_user_channel_t *user_channel,
                                      const uvm_fault_buffer_entry_t *fault_entry,
                                      NvU32 batch_id,
                                      uvm_tracker_t *tracker)
{
    if (gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu))
        return clear_faulted_method_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);

    return clear_faulted_register_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
}

static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
                                                       uvm_va_block_t *va_block,
                                                       uvm_va_block_retry_t *va_block_retry,
                                                       uvm_fault_buffer_entry_t *fault_entry,
                                                       uvm_service_block_context_t *service_context)
{
    NV_STATUS status = NV_OK;
    uvm_page_index_t page_index;
    uvm_perf_thrashing_hint_t thrashing_hint;
    uvm_processor_id_t new_residency;
    bool read_duplicate;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;

    UVM_ASSERT(!fault_entry->is_fatal);

    uvm_assert_rwsem_locked(&va_space->lock);

    UVM_ASSERT(fault_entry->va_space == va_space);
    UVM_ASSERT(fault_entry->fault_address >= va_block->start);
    UVM_ASSERT(fault_entry->fault_address <= va_block->end);

    service_context->block_context.policy = uvm_va_policy_get(va_block, fault_entry->fault_address);

    if (service_context->num_retries == 0) {
        // Notify the event to tools/performance heuristics. For now we use a
        // unique batch id per fault, since we clear the faulted channel for
        // each fault.
        uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
                                        va_block,
                                        gpu->id,
                                        service_context->block_context.policy->preferred_location,
                                        fault_entry,
                                        ++non_replayable_faults->batch_id,
                                        false);
    }

    // Check logical permissions
    status = uvm_va_block_check_logical_permissions(va_block,
                                                    &service_context->block_context,
                                                    gpu->id,
                                                    uvm_va_block_cpu_page_index(va_block,
                                                                                fault_entry->fault_address),
                                                    fault_entry->fault_access_type,
                                                    uvm_range_group_address_migratable(va_space,
                                                                                       fault_entry->fault_address));
    if (status != NV_OK) {
        fault_entry->is_fatal = true;
        fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
        return NV_OK;
    }

    // TODO: Bug 1880194: Revisit thrashing detection
    thrashing_hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;

    service_context->read_duplicate_count = 0;
    service_context->thrashing_pin_count = 0;

    page_index = uvm_va_block_cpu_page_index(va_block, fault_entry->fault_address);

    // Compute the new residency and update the masks
    new_residency = uvm_va_block_select_residency(va_block,
                                                  &service_context->block_context,
                                                  page_index,
                                                  gpu->id,
                                                  fault_entry->access_type_mask,
                                                  service_context->block_context.policy,
                                                  &thrashing_hint,
                                                  UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
                                                  &read_duplicate);

    // Initialize the minimum necessary state in the fault service context
    uvm_processor_mask_zero(&service_context->resident_processors);

    // Set the new residency and update the masks
    uvm_processor_mask_set(&service_context->resident_processors, new_residency);

    // The masks need to be fully zeroed as the fault region may grow due to
    // prefetching
    uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
    uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);

    if (read_duplicate) {
        uvm_page_mask_zero(&service_context->read_duplicate_mask);
        uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
        service_context->read_duplicate_count = 1;
    }

    service_context->access_type[page_index] = fault_entry->fault_access_type;

    service_context->region = uvm_va_block_region_for_page(page_index);

    status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, service_context);

    ++service_context->num_retries;

    return status;
}

static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
                                                uvm_va_block_t *va_block,
                                                uvm_fault_buffer_entry_t *fault_entry)
{
    NV_STATUS status, tracker_status;
    uvm_va_block_retry_t va_block_retry;
    uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.non_replayable.block_service_context;

    service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
    service_context->num_retries = 0;

    if (uvm_va_block_is_hmm(va_block)) {
        uvm_hmm_service_context_init(service_context);
        uvm_hmm_migrate_begin_wait(va_block);
    }

    uvm_mutex_lock(&va_block->lock);

    status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
                                       service_managed_fault_in_block_locked(gpu,
                                                                             va_block,
                                                                             &va_block_retry,
                                                                             fault_entry,
                                                                             service_context));

    // Snapshot the block's tracker into fault_service_tracker so that clearing
    // the channel's faulted bit can wait on the servicing work; see the
    // clear_faulted_on_gpu() call in service_fault().
    tracker_status = uvm_tracker_add_tracker_safe(&gpu->parent->fault_buffer_info.non_replayable.fault_service_tracker,
                                                  &va_block->tracker);

    uvm_mutex_unlock(&va_block->lock);

    if (uvm_va_block_is_hmm(va_block))
        uvm_hmm_migrate_finish(va_block);

    return status == NV_OK ? tracker_status : status;
}

// See uvm_unregister_channel for comments on the channel destruction sequence.
static void kill_channel_delayed(void *_user_channel)
{
    uvm_user_channel_t *user_channel = (uvm_user_channel_t *)_user_channel;
    uvm_va_space_t *va_space = user_channel->kill_channel.va_space;

    uvm_va_space_down_read_rm(va_space);
    if (user_channel->gpu_va_space) {
        // RM handles the fault, which will do the correct fault reporting in
        // the kernel logs and will initiate channel teardown
        NV_STATUS status = nvUvmInterfaceReportNonReplayableFault(uvm_gpu_device_handle(user_channel->gpu),
                                                                  user_channel->kill_channel.fault_packet);
        UVM_ASSERT(status == NV_OK);
    }
    uvm_va_space_up_read_rm(va_space);

    uvm_user_channel_release(user_channel);
}

static void kill_channel_delayed_entry(void *user_channel)
{
    UVM_ENTRY_VOID(kill_channel_delayed(user_channel));
}

static void schedule_kill_channel(uvm_gpu_t *gpu,
                                  uvm_fault_buffer_entry_t *fault_entry,
                                  uvm_user_channel_t *user_channel)
{
    uvm_va_space_t *va_space = fault_entry->va_space;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    void *packet = (char *)non_replayable_faults->shadow_buffer_copy +
                   (fault_entry->non_replayable.buffer_index * gpu->parent->fault_buffer_hal->entry_size(gpu->parent));

    UVM_ASSERT(gpu);
    UVM_ASSERT(va_space);
    UVM_ASSERT(user_channel);

    if (user_channel->kill_channel.scheduled)
        return;

    user_channel->kill_channel.scheduled = true;
    user_channel->kill_channel.va_space = va_space;

    // Save the packet to be handled by RM in the channel structure
    memcpy(user_channel->kill_channel.fault_packet, packet, gpu->parent->fault_buffer_hal->entry_size(gpu->parent));

    // Retain the channel here so it is not prematurely destroyed. It will be
    // released after forwarding the fault to RM in kill_channel_delayed.
    uvm_user_channel_retain(user_channel);

    // Schedule a work item to kill the channel
    nv_kthread_q_item_init(&user_channel->kill_channel.kill_channel_q_item,
                           kill_channel_delayed_entry,
                           user_channel);

    nv_kthread_q_schedule_q_item(&gpu->parent->isr.kill_channel_q,
                                 &user_channel->kill_channel.kill_channel_q_item);
}

static void service_fault_fatal(uvm_fault_buffer_entry_t *fault_entry, NV_STATUS status)
{
    UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);

    fault_entry->is_fatal = true;
    fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
}

static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
                                           struct mm_struct *mm,
                                           uvm_fault_buffer_entry_t *fault_entry,
                                           NV_STATUS lookup_status)
{
    uvm_gpu_t *gpu = gpu_va_space->gpu;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    uvm_ats_fault_invalidate_t *ats_invalidate = &non_replayable_faults->ats_invalidate;
    NV_STATUS status = lookup_status;
    NV_STATUS fatal_fault_status = NV_ERR_INVALID_ADDRESS;

    UVM_ASSERT(!fault_entry->is_fatal);

    // Avoid dropping fault events when the VA block is not found or cannot be
    // created
    uvm_perf_event_notify_gpu_fault(&fault_entry->va_space->perf_events,
                                    NULL,
                                    gpu->id,
                                    UVM_ID_INVALID,
                                    fault_entry,
                                    ++non_replayable_faults->batch_id,
                                    false);

    if (status != NV_ERR_INVALID_ADDRESS)
        return status;

    if (uvm_ats_can_service_faults(gpu_va_space, mm)) {
        struct vm_area_struct *vma;
        uvm_va_range_t *va_range_next;
        NvU64 fault_address = fault_entry->fault_address;
        uvm_fault_access_type_t fault_access_type = fault_entry->fault_access_type;
        uvm_ats_fault_context_t *ats_context = &non_replayable_faults->ats_context;

        uvm_page_mask_zero(&ats_context->read_fault_mask);
        uvm_page_mask_zero(&ats_context->write_fault_mask);

        ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;

        ats_invalidate->write_faults_in_batch = false;

        va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

        // The VA isn't managed. See if ATS knows about it.
        vma = find_vma_intersection(mm, fault_address, fault_address + 1);
        if (!vma || uvm_ats_check_in_gmmu_region(gpu_va_space->va_space, fault_address, va_range_next)) {
            // Do not return an error due to logical errors in the application
            status = NV_OK;
        }
        else {
            NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
            uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
            uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;
            uvm_page_mask_t *fault_mask = (fault_access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) ?
                                              &ats_context->write_fault_mask :
                                              &ats_context->read_fault_mask;

            uvm_page_mask_set(fault_mask, page_index);

            status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
            if (status == NV_OK) {
                // Invalidate ATS TLB entries if needed
                if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
                    status = uvm_ats_invalidate_tlbs(gpu_va_space,
                                                     ats_invalidate,
                                                     &non_replayable_faults->fault_service_tracker);
                    fatal_fault_status = NV_OK;
                }
            }
            else {
                fatal_fault_status = status;
            }
        }
    }
    else {
        fatal_fault_status = status;

        // Do not return an error due to logical errors in the application
        status = NV_OK;
    }

    if (fatal_fault_status != NV_OK)
        service_fault_fatal(fault_entry, fatal_fault_status);

    return status;
}

static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry)
{
    NV_STATUS status;
    uvm_user_channel_t *user_channel;
    uvm_va_block_t *va_block;
    uvm_va_space_t *va_space = NULL;
    struct mm_struct *mm;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    uvm_va_block_context_t *va_block_context =
        &gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;

    status = uvm_gpu_fault_entry_to_va_space(gpu, fault_entry, &va_space);
    if (status != NV_OK) {
        // The VA space lookup will fail if we're running concurrently with
        // removal of the channel from the VA space (channel unregister, GPU
        // VA space unregister, VA space destroy, etc). The other thread will
        // stop the channel and remove it from the table, so the faulting
        // condition will be gone. In the case of replayable faults we need to
        // flush the buffer, but here we can just ignore the entry and proceed
        // on.
        //
        // Note that we can't have any subcontext issues here, since non-
        // replayable faults only use the address space of their channel.
        UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
        UVM_ASSERT(!va_space);
        return NV_OK;
    }

    UVM_ASSERT(va_space);

    // If an mm is registered with the VA space, we have to retain it in order
    // to lock it before locking the VA space. It is guaranteed to remain
    // valid until we release it. If no mm is registered, we can only service
    // managed faults, not ATS/HMM faults.
    mm = uvm_va_space_mm_retain_lock(va_space);
    va_block_context->mm = mm;

    uvm_va_space_down_read(va_space);

    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);

    if (!gpu_va_space) {
        // The va_space might have gone away. See the comment above.
        status = NV_OK;
        goto exit_no_channel;
    }

    fault_entry->va_space = va_space;

    user_channel = uvm_gpu_va_space_get_user_channel(gpu_va_space, fault_entry->instance_ptr);
    if (!user_channel) {
        // The channel might have gone away. See the comment above.
        status = NV_OK;
        goto exit_no_channel;
    }

    fault_entry->fault_source.channel_id = user_channel->hw_channel_id;

    if (!fault_entry->is_fatal) {
        status = uvm_va_block_find_create(fault_entry->va_space,
                                          fault_entry->fault_address,
                                          va_block_context,
                                          &va_block);
        if (status == NV_OK)
            status = service_managed_fault_in_block(gpu_va_space->gpu, va_block, fault_entry);
        else
            status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);

        // We are done. Clear the faulted bit on the channel so it can be
        // rescheduled.
        if (status == NV_OK && !fault_entry->is_fatal) {
            status = clear_faulted_on_gpu(gpu,
                                          user_channel,
                                          fault_entry,
                                          non_replayable_faults->batch_id,
                                          &non_replayable_faults->fault_service_tracker);
            uvm_tracker_clear(&non_replayable_faults->fault_service_tracker);
        }
    }

    if (fault_entry->is_fatal)
        uvm_tools_record_gpu_fatal_fault(gpu->parent->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);

    if (status != NV_OK || fault_entry->is_fatal)
        schedule_kill_channel(gpu, fault_entry, user_channel);

exit_no_channel:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_release_unlock(va_space, mm);

    return status;
}

void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
{
    NV_STATUS status = NV_OK;
    NvU32 cached_faults;

    // If this handler is modified to handle fewer than all of the outstanding
    // faults, then special handling will need to be added to uvm_suspend() to
    // guarantee that fault processing has completed before control is
    // returned to the RM.
    while ((cached_faults = fetch_non_replayable_fault_buffer_entries(gpu)) > 0) {
        NvU32 i;

        // Unlike replayable faults, we do not batch up and preprocess
        // non-replayable faults, since getting multiple faults on the same
        // memory region is not very likely.
        //
        // TODO: Bug 2103669: [UVM/ATS] Optimize ATS fault servicing
        for (i = 0; i < cached_faults; ++i) {
            status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
            if (status != NV_OK)
                break;
        }
    }

    if (status != NV_OK)
        UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
}