/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_api.h"
#include "uvm_global.h"
#include "uvm_gpu_isr.h"
#include "uvm_hal.h"
#include "uvm_gpu.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_gpu_non_replayable_faults.h"
#include "uvm_thread_context.h"

// Level-based vs pulse-based interrupts
// =====================================
// Turing switches to pulse-based interrupts for replayable/non-replayable
// faults and access counter notifications. Prior GPUs use level-based
// interrupts.
//
// Level-based interrupts are rearmed automatically as long as the interrupt
// condition is set. Pulse-based interrupts, on the other hand, are
// re-triggered by clearing their interrupt line and forcing the interrupt
// condition to be re-evaluated. However, RM re-triggers all top-level
// interrupts when exiting its top half. Thus, both level-based and pulse-based
// interrupts need to be disabled at interrupt handling boundaries, in order to
// avoid interrupt storms.
//
// Moreover, in order to make sure that pulse-based interrupts are not missed,
// we need to clear the interrupt bit and force an interrupt condition
// re-evaluation after interrupts are re-enabled. In the case of replayable
// faults and access counter notifications the interrupt condition is
// re-evaluated by writing to GET. Non-replayable faults work the same way, but
// they are currently owned by RM, so UVM doesn't have to do anything.

// For use only by the nv_kthread_q that is servicing the replayable fault
// bottom half.
static void replayable_faults_isr_bottom_half_entry(void *args);

// For use only by the nv_kthread_q that is servicing the non-replayable fault
// bottom half.
static void non_replayable_faults_isr_bottom_half_entry(void *args);

// For use only by the nv_kthread_q that is servicing the access counters
// bottom half.
static void access_counters_isr_bottom_half_entry(void *args);

// Increments the reference count tracking whether replayable page fault
// interrupts should be disabled. The caller is guaranteed that replayable page
// faults are disabled upon return. Interrupts might already be disabled prior
// to making this call. Each call is ref-counted, so this must be paired with a
// call to uvm_parent_gpu_replayable_faults_intr_enable().
//
// parent_gpu->isr.interrupts_lock must be held to call this function.
static void uvm_parent_gpu_replayable_faults_intr_disable(uvm_parent_gpu_t *parent_gpu);

// Decrements the reference count tracking whether replayable page fault
// interrupts should be disabled. Only once the count reaches 0 are the HW
// interrupts actually enabled, so this call does not guarantee that the
// interrupts have been re-enabled upon return.
//
// uvm_parent_gpu_replayable_faults_intr_disable() must have been called prior
// to calling this function.
//
// parent_gpu->isr.interrupts_lock must be held to call this function.
static void uvm_parent_gpu_replayable_faults_intr_enable(uvm_parent_gpu_t *parent_gpu);
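
// For illustration, a hedged sketch of the intended disable/enable pairing
// (hypothetical caller; the real sequences live in
// uvm_parent_gpu_replayable_faults_isr_lock()/_unlock() below). Because the
// calls are ref-counted, nested disables are fine; the HW interrupt only
// toggles on the 0 <-> 1 transitions of disable_intr_ref_count:
//
//     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
//     uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu); // 0 -> 1, HW off
//     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
//
//     // ... work with replayable fault interrupts masked ...
//
//     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
//     uvm_parent_gpu_replayable_faults_intr_enable(parent_gpu);  // 1 -> 0, HW on
//     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);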
static unsigned schedule_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
{
    uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);

    if (parent_gpu->isr.is_suspended)
        return 0;

    // handling gets set to false for all handlers during removal, so quit if
    // the GPU is in the process of being removed.
    if (!parent_gpu->isr.replayable_faults.handling)
        return 0;

    // Use a raw call instead of the UVM helper. Ownership will be recorded in
    // the bottom half. See the comment in replayable_faults_isr_bottom_half().
    if (down_trylock(&parent_gpu->isr.replayable_faults.service_lock.sem) != 0)
        return 0;

    if (!uvm_gpu_replayable_faults_pending(parent_gpu)) {
        up(&parent_gpu->isr.replayable_faults.service_lock.sem);
        return 0;
    }

    nv_kref_get(&parent_gpu->gpu_kref);

    // Interrupts need to be disabled here to avoid an interrupt storm
    uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu);

    // Schedule a bottom half, but do *not* release the GPU ISR lock. The
    // bottom half releases the GPU ISR lock as part of its cleanup.
    nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
                                 &parent_gpu->isr.replayable_faults.bottom_half_q_item);

    return 1;
}

static unsigned schedule_non_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
{
    bool scheduled;

    if (parent_gpu->isr.is_suspended)
        return 0;

    // handling gets set to false for all handlers during removal, so quit if
    // the GPU is in the process of being removed.
    if (!parent_gpu->isr.non_replayable_faults.handling)
        return 0;

    // Non-replayable faults are stored in a synchronized circular queue shared
    // by RM and UVM, so we can query the number of pending faults. These
    // faults are not replayed, and since RM advances GET to PUT when copying
    // the fault packets into the queue, the GPU will not trigger further
    // interrupts for them and faults may remain unserviced. Therefore, if
    // there is any fault in the queue, we schedule a bottom half
    // unconditionally.
    if (!uvm_gpu_non_replayable_faults_pending(parent_gpu))
        return 0;

    nv_kref_get(&parent_gpu->gpu_kref);

    scheduled = nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
                                             &parent_gpu->isr.non_replayable_faults.bottom_half_q_item) != 0;

    // If the q_item did not get scheduled because it was already queued, that
    // instance will handle the pending faults. Just drop the GPU kref.
    if (!scheduled)
        uvm_parent_gpu_kref_put(parent_gpu);

    return 1;
}
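
// GPU kref lifecycle for a scheduled bottom half, condensed into a hedged
// sketch (this mirrors the schedule_*_handler() functions above; it is not
// new code):
//
//     nv_kref_get(&parent_gpu->gpu_kref);         // top half: keep the GPU alive
//     if (!nv_kthread_q_schedule_q_item(...))     // already queued?
//         uvm_parent_gpu_kref_put(parent_gpu);    // queued instance owns the work
//     ...
//     // bottom half, after servicing:
//     uvm_parent_gpu_kref_put(parent_gpu);        // drop the scheduling reference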
static unsigned schedule_access_counters_handler(uvm_parent_gpu_t *parent_gpu)
{
    uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);

    if (parent_gpu->isr.is_suspended)
        return 0;

    if (!parent_gpu->isr.access_counters.handling_ref_count)
        return 0;

    if (down_trylock(&parent_gpu->isr.access_counters.service_lock.sem) != 0)
        return 0;

    if (!uvm_gpu_access_counters_pending(parent_gpu)) {
        up(&parent_gpu->isr.access_counters.service_lock.sem);
        return 0;
    }

    nv_kref_get(&parent_gpu->gpu_kref);

    // Interrupts need to be disabled to avoid an interrupt storm
    uvm_parent_gpu_access_counters_intr_disable(parent_gpu);

    nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
                                 &parent_gpu->isr.access_counters.bottom_half_q_item);

    return 1;
}

// This is called from RM's top-half ISR (see the nvidia_isr() function), giving
// UVM a chance to handle the interrupt before most of the RM processing. UVM
// communicates what it did back to RM via the return code:
//
// NV_OK:
//     UVM handled an interrupt.
//
// NV_WARN_MORE_PROCESSING_REQUIRED:
//     UVM did not schedule a bottom half because it was unable to get the
//     locks it needed, but there is still UVM work to be done. RM will return
//     "not handled" to the Linux kernel, *unless* RM handled other faults in
//     its top half. In that case, the fact that UVM did not handle its
//     interrupt is lost. However, life and interrupt processing continue
//     anyway: the GPU will soon raise another interrupt, because that's what
//     it does when there are replayable page faults remaining (GET != PUT in
//     the fault buffer).
//
// NV_ERR_NO_INTR_PENDING:
//     UVM did not find any work to do. Currently this is handled by RM in
//     exactly the same way as NV_WARN_MORE_PROCESSING_REQUIRED. However, the
//     extra precision is available for the future. RM's interrupt handling
//     tends to evolve as new chips and new interrupts get created.
static NV_STATUS uvm_isr_top_half(const NvProcessorUuid *gpu_uuid)
{
    uvm_parent_gpu_t *parent_gpu;
    unsigned num_handlers_scheduled = 0;
    NV_STATUS status = NV_OK;

    if (!in_interrupt() && in_atomic()) {
        // Early-out if we're not in interrupt context, but memory allocations
        // require GFP_ATOMIC. This happens with CONFIG_DEBUG_SHIRQ enabled,
        // where the interrupt handler is called as part of its removal to make
        // sure it's prepared for being called even while it's being freed.
        // This breaks the assumption that the UVM driver is called in atomic
        // context only in interrupt context, which the thread context
        // management relies on.
        return NV_OK;
    }

    if (!gpu_uuid) {
        // This can happen early in the main GPU driver initialization, because
        // that involves testing interrupts before the GPU is fully set up.
        return NV_ERR_NO_INTR_PENDING;
    }

    uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);

    parent_gpu = uvm_parent_gpu_get_by_uuid_locked(gpu_uuid);

    if (parent_gpu == NULL) {
        uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
        return NV_ERR_NO_INTR_PENDING;
    }

    // We take a reference during the top half, and an additional reference for
    // each scheduled bottom half. References are dropped at the end of the
    // bottom halves.
    nv_kref_get(&parent_gpu->gpu_kref);
    uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);

    // Now that we have a GPU object, lock it so that it can't be removed
    // without us noticing.
    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    ++parent_gpu->isr.interrupt_count;

    num_handlers_scheduled += schedule_replayable_faults_handler(parent_gpu);
    num_handlers_scheduled += schedule_non_replayable_faults_handler(parent_gpu);
    num_handlers_scheduled += schedule_access_counters_handler(parent_gpu);

    if (num_handlers_scheduled == 0) {
        if (parent_gpu->isr.is_suspended)
            status = NV_ERR_NO_INTR_PENDING;
        else
            status = NV_WARN_MORE_PROCESSING_REQUIRED;
    }

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);

    uvm_parent_gpu_kref_put(parent_gpu);

    return status;
}

NV_STATUS uvm_isr_top_half_entry(const NvProcessorUuid *gpu_uuid)
{
    UVM_ENTRY_RET(uvm_isr_top_half(gpu_uuid));
}
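
// For illustration only, a hedged sketch of how a caller such as RM's top
// half might consume these return codes (hypothetical pseudo-caller; RM's
// actual logic lives outside this file):
//
//     NV_STATUS status = uvm_isr_top_half_entry(gpu_uuid);
//
//     if (status == NV_OK) {
//         // UVM scheduled at least one bottom half: report "handled".
//     }
//     else {
//         // NV_WARN_MORE_PROCESSING_REQUIRED and NV_ERR_NO_INTR_PENDING are
//         // currently treated identically: report "not handled", unless RM
//         // serviced other interrupts in the same pass.
//     }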
static NV_STATUS init_queue_on_node(nv_kthread_q_t *queue, const char *name, int node)
{
#if UVM_THREAD_AFFINITY_SUPPORTED()
    if (node != -1 && !cpumask_empty(uvm_cpumask_of_node(node))) {
        NV_STATUS status;

        status = errno_to_nv_status(nv_kthread_q_init_on_node(queue, name, node));
        if (status != NV_OK)
            return status;

        return errno_to_nv_status(set_cpus_allowed_ptr(queue->q_kthread, uvm_cpumask_of_node(node)));
    }
#endif

    return errno_to_nv_status(nv_kthread_q_init(queue, name));
}
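
// Illustrative usage of init_queue_on_node(), as a hedged sketch (it mirrors
// the calls made from uvm_parent_gpu_init_isr() below; "UVM GPU1 BH" is just
// an example name):
//
//     status = init_queue_on_node(&parent_gpu->isr.bottom_half_q,
//                                 "UVM GPU1 BH",
//                                 parent_gpu->closest_cpu_numa_node);
//
// The queue's kthread is created on, and pinned to, the given NUMA node when
// thread affinity is supported and the node has usable CPUs; otherwise this
// falls back to a plain nv_kthread_q_init() with no affinity.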
NV_STATUS uvm_parent_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status = NV_OK;
    char kthread_name[TASK_COMM_LEN + 1];
    uvm_va_block_context_t *block_context;

    if (parent_gpu->replayable_faults_supported) {
        status = uvm_gpu_fault_buffer_init(parent_gpu);
        if (status != NV_OK) {
            UVM_ERR_PRINT("Failed to initialize GPU fault buffer: %s, GPU: %s\n",
                          nvstatusToString(status),
                          uvm_parent_gpu_name(parent_gpu));
            return status;
        }

        nv_kthread_q_item_init(&parent_gpu->isr.replayable_faults.bottom_half_q_item,
                               replayable_faults_isr_bottom_half_entry,
                               parent_gpu);

        parent_gpu->isr.replayable_faults.stats.cpu_exec_count =
            uvm_kvmalloc_zero(sizeof(*parent_gpu->isr.replayable_faults.stats.cpu_exec_count) * num_possible_cpus());
        if (!parent_gpu->isr.replayable_faults.stats.cpu_exec_count)
            return NV_ERR_NO_MEMORY;

        block_context = uvm_va_block_context_alloc(NULL);
        if (!block_context)
            return NV_ERR_NO_MEMORY;

        parent_gpu->fault_buffer_info.replayable.block_service_context.block_context = block_context;

        parent_gpu->isr.replayable_faults.handling = true;

        snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u BH", uvm_parent_id_value(parent_gpu->id));
        status = init_queue_on_node(&parent_gpu->isr.bottom_half_q, kthread_name, parent_gpu->closest_cpu_numa_node);
        if (status != NV_OK) {
            UVM_ERR_PRINT("Failed in nv_kthread_q_init for bottom_half_q: %s, GPU %s\n",
                          nvstatusToString(status),
                          uvm_parent_gpu_name(parent_gpu));
            return status;
        }

        if (parent_gpu->non_replayable_faults_supported) {
            nv_kthread_q_item_init(&parent_gpu->isr.non_replayable_faults.bottom_half_q_item,
                                   non_replayable_faults_isr_bottom_half_entry,
                                   parent_gpu);

            parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count =
                uvm_kvmalloc_zero(sizeof(*parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count) *
                                  num_possible_cpus());
            if (!parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count)
                return NV_ERR_NO_MEMORY;

            block_context = uvm_va_block_context_alloc(NULL);
            if (!block_context)
                return NV_ERR_NO_MEMORY;

            parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context = block_context;

            parent_gpu->isr.non_replayable_faults.handling = true;

            snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u KC", uvm_parent_id_value(parent_gpu->id));
            status = init_queue_on_node(&parent_gpu->isr.kill_channel_q,
                                        kthread_name,
                                        parent_gpu->closest_cpu_numa_node);
            if (status != NV_OK) {
                UVM_ERR_PRINT("Failed in nv_kthread_q_init for kill_channel_q: %s, GPU %s\n",
                              nvstatusToString(status),
                              uvm_parent_gpu_name(parent_gpu));
                return status;
            }
        }

        if (parent_gpu->access_counters_supported) {
            status = uvm_gpu_init_access_counters(parent_gpu);
            if (status != NV_OK) {
                UVM_ERR_PRINT("Failed to initialize GPU access counters: %s, GPU: %s\n",
                              nvstatusToString(status),
                              uvm_parent_gpu_name(parent_gpu));
                return status;
            }

            block_context = uvm_va_block_context_alloc(NULL);
            if (!block_context)
                return NV_ERR_NO_MEMORY;

            parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context =
                block_context;

            nv_kthread_q_item_init(&parent_gpu->isr.access_counters.bottom_half_q_item,
                                   access_counters_isr_bottom_half_entry,
                                   parent_gpu);

            // Access counters interrupts are initially disabled. They are
            // dynamically enabled when the GPU is registered on a VA space.
            parent_gpu->isr.access_counters.handling_ref_count = 0;
            parent_gpu->isr.access_counters.stats.cpu_exec_count =
                uvm_kvmalloc_zero(sizeof(*parent_gpu->isr.access_counters.stats.cpu_exec_count) *
                                  num_possible_cpus());
            if (!parent_gpu->isr.access_counters.stats.cpu_exec_count)
                return NV_ERR_NO_MEMORY;
        }
    }

    return NV_OK;
}

void uvm_parent_gpu_flush_bottom_halves(uvm_parent_gpu_t *parent_gpu)
{
    nv_kthread_q_flush(&parent_gpu->isr.bottom_half_q);
    nv_kthread_q_flush(&parent_gpu->isr.kill_channel_q);
}
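
// Teardown ordering, as a hedged sketch of the expected caller (e.g.
// remove_gpu(), which lives outside this file):
//
//     // 1) Remove the parent GPU from the global table, so the top half can
//     //    no longer find it and schedule new bottom halves.
//     // 2) Quiesce and stop the bottom half queues:
//     uvm_parent_gpu_disable_isr(parent_gpu);
//     // 3) Only after all bottom halves have drained, release the
//     //    ISR-related resources and return fault buffer ownership to RM:
//     uvm_parent_gpu_deinit_isr(parent_gpu);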
void uvm_parent_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(parent_gpu->isr.access_counters.handling_ref_count == 0);

    // Now that the GPU is safely out of the global table, lock the GPU and
    // mark it as no longer handling interrupts, so the top half knows not to
    // schedule any more bottom halves.
    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu);

    parent_gpu->isr.replayable_faults.was_handling = parent_gpu->isr.replayable_faults.handling;
    parent_gpu->isr.non_replayable_faults.was_handling = parent_gpu->isr.non_replayable_faults.handling;

    parent_gpu->isr.replayable_faults.handling = false;
    parent_gpu->isr.non_replayable_faults.handling = false;

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);

    // Flush all bottom half ISR work items and stop the nv_kthread_q that is
    // servicing this GPU's bottom halves. Note that this requires that the
    // bottom half never take the global lock, since we're holding it here.
    //
    // Note that it's safe to call nv_kthread_q_stop() even if
    // nv_kthread_q_init() failed in uvm_parent_gpu_init_isr().
    nv_kthread_q_stop(&parent_gpu->isr.bottom_half_q);
    nv_kthread_q_stop(&parent_gpu->isr.kill_channel_q);
}

void uvm_parent_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
{
    uvm_va_block_context_t *block_context;

    // Return ownership to RM:
    if (parent_gpu->isr.replayable_faults.was_handling) {
        // No user threads could have anything left on
        // replayable_faults.disable_intr_ref_count since they must retain the
        // GPU across uvm_parent_gpu_replayable_faults_isr_lock/
        // uvm_parent_gpu_replayable_faults_isr_unlock. This means the
        // uvm_parent_gpu_replayable_faults_intr_disable() call in
        // uvm_parent_gpu_disable_isr() could only have raced with bottom
        // halves.
        //
        // If we cleared replayable_faults.handling before the bottom half got
        // to its uvm_parent_gpu_replayable_faults_isr_unlock, when it
        // eventually reached uvm_parent_gpu_replayable_faults_isr_unlock it
        // would have skipped the disable, leaving us with extra ref counts
        // here.
        //
        // In any case we're guaranteed that replayable fault interrupts are
        // disabled and can't get re-enabled, so we can safely ignore the ref
        // count value and just clean things up.
        UVM_ASSERT_MSG(parent_gpu->isr.replayable_faults.disable_intr_ref_count > 0,
                       "%s replayable_faults.disable_intr_ref_count: %llu\n",
                       uvm_parent_gpu_name(parent_gpu),
                       parent_gpu->isr.replayable_faults.disable_intr_ref_count);

        uvm_gpu_fault_buffer_deinit(parent_gpu);
    }

    if (parent_gpu->access_counters_supported) {
        // It is safe to deinitialize access counters even if they have not
        // been successfully initialized.
        uvm_gpu_deinit_access_counters(parent_gpu);
        block_context =
            parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context;
        uvm_va_block_context_free(block_context);
    }

    if (parent_gpu->non_replayable_faults_supported) {
        block_context = parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context;
        uvm_va_block_context_free(block_context);
    }

    block_context = parent_gpu->fault_buffer_info.replayable.block_service_context.block_context;
    uvm_va_block_context_free(block_context);
    uvm_kvfree(parent_gpu->isr.replayable_faults.stats.cpu_exec_count);
    uvm_kvfree(parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count);
    uvm_kvfree(parent_gpu->isr.access_counters.stats.cpu_exec_count);
}
static uvm_gpu_t *find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu)
{
    uvm_gpu_t *gpu;

    // When SMC is enabled, there's no longer a 1:1 relationship between the
    // parent and the partitions. But because all relevant interrupt paths are
    // shared, as is the fault reporting logic, it's sufficient here to proceed
    // with any valid uvm_gpu_t, even if the corresponding partition didn't
    // cause all, or even any, of the interrupts. The bottom half handlers will
    // later find the appropriate partitions by attributing the notifications
    // to VA spaces as necessary.
    if (parent_gpu->smc.enabled) {
        NvU32 sub_processor_index;

        uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);

        sub_processor_index = find_first_bit(parent_gpu->valid_gpus, UVM_PARENT_ID_MAX_SUB_PROCESSORS);

        if (sub_processor_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS) {
            gpu = parent_gpu->gpus[sub_processor_index];
            UVM_ASSERT(gpu != NULL);
        }
        else {
            gpu = NULL;
        }

        uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
    }
    else {
        gpu = parent_gpu->gpus[0];
        UVM_ASSERT(gpu != NULL);
    }

    return gpu;
}
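
// The top/bottom half "ownership transfer" of the service_lock, condensed
// into a hedged sketch (the real halves are schedule_replayable_faults_handler()
// above and replayable_faults_isr_bottom_half() below):
//
//     // Top half, interrupt context: raw trylock, no lock tracking.
//     if (down_trylock(&parent_gpu->isr.replayable_faults.service_lock.sem) != 0)
//         return;
//     // ... schedule the bottom half, leaving the semaphore held ...
//
//     // Bottom half, kthread context: adopt the lock, then release it with
//     // the regular UVM helper so lock tracking stays balanced.
//     uvm_record_lock(&parent_gpu->isr.replayable_faults.service_lock,
//                     UVM_LOCK_FLAGS_MODE_SHARED);
//     // ... service faults ...
//     uvm_parent_gpu_replayable_faults_isr_unlock(parent_gpu);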
static void replayable_faults_isr_bottom_half(void *args)
{
    uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
    uvm_gpu_t *gpu;
    unsigned int cpu;

    gpu = find_first_valid_gpu(parent_gpu);
    if (gpu == NULL)
        goto put_kref;

    UVM_ASSERT(parent_gpu->replayable_faults_supported);

    // Record the lock ownership. The service_lock semaphore was taken in the
    // top half using a raw semaphore call (down_trylock()). Here, the lock
    // "ownership" is recorded using a direct call to uvm_record_lock(). The
    // pair of raw calls results in an ownership "transfer" between the top and
    // bottom halves. Due to this ownership transfer, other usages of the
    // service_lock can use the UVM (un)lock helpers to handle lock ownership
    // and record keeping.
    uvm_record_lock(&parent_gpu->isr.replayable_faults.service_lock, UVM_LOCK_FLAGS_MODE_SHARED);

    // Multiple bottom halves for replayable faults can be running
    // concurrently, but only one can be running this function for a given GPU
    // since we enter with the replayable_faults.service_lock held.
    cpu = get_cpu();
    ++parent_gpu->isr.replayable_faults.stats.bottom_half_count;
    cpumask_set_cpu(cpu, &parent_gpu->isr.replayable_faults.stats.cpus_used_mask);
    ++parent_gpu->isr.replayable_faults.stats.cpu_exec_count[cpu];
    put_cpu();

    uvm_gpu_service_replayable_faults(gpu);

    uvm_parent_gpu_replayable_faults_isr_unlock(parent_gpu);

put_kref:
    // It is OK to drop a reference on the parent GPU even if a bottom half has
    // been retriggered within uvm_parent_gpu_replayable_faults_isr_unlock,
    // because the rescheduling added an additional reference.
    uvm_parent_gpu_kref_put(parent_gpu);
}

static void replayable_faults_isr_bottom_half_entry(void *args)
{
    UVM_ENTRY_VOID(replayable_faults_isr_bottom_half(args));
}

static void non_replayable_faults_isr_bottom_half(void *args)
{
    uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
    uvm_gpu_t *gpu;
    unsigned int cpu;

    gpu = find_first_valid_gpu(parent_gpu);
    if (gpu == NULL)
        goto put_kref;

    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);

    uvm_parent_gpu_non_replayable_faults_isr_lock(parent_gpu);

    // Multiple bottom halves for non-replayable faults can be running
    // concurrently, but only one can enter this section for a given GPU since
    // we acquired the non_replayable_faults.service_lock.
    cpu = get_cpu();
    ++parent_gpu->isr.non_replayable_faults.stats.bottom_half_count;
    cpumask_set_cpu(cpu, &parent_gpu->isr.non_replayable_faults.stats.cpus_used_mask);
    ++parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count[cpu];
    put_cpu();

    uvm_gpu_service_non_replayable_fault_buffer(gpu);

    uvm_parent_gpu_non_replayable_faults_isr_unlock(parent_gpu);

put_kref:
    uvm_parent_gpu_kref_put(parent_gpu);
}

static void non_replayable_faults_isr_bottom_half_entry(void *args)
{
    UVM_ENTRY_VOID(non_replayable_faults_isr_bottom_half(args));
}

static void access_counters_isr_bottom_half(void *args)
{
    uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
    uvm_gpu_t *gpu;
    unsigned int cpu;

    gpu = find_first_valid_gpu(parent_gpu);
    if (gpu == NULL)
        goto put_kref;

    UVM_ASSERT(parent_gpu->access_counters_supported);

    uvm_record_lock(&parent_gpu->isr.access_counters.service_lock, UVM_LOCK_FLAGS_MODE_SHARED);

    // Multiple bottom halves for access counter notifications can be running
    // concurrently, but only one can be running this function for a given GPU
    // since we enter with the access_counters.service_lock held.
    cpu = get_cpu();
    ++parent_gpu->isr.access_counters.stats.bottom_half_count;
    cpumask_set_cpu(cpu, &parent_gpu->isr.access_counters.stats.cpus_used_mask);
    ++parent_gpu->isr.access_counters.stats.cpu_exec_count[cpu];
    put_cpu();

    uvm_gpu_service_access_counters(gpu);

    uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);

put_kref:
    uvm_parent_gpu_kref_put(parent_gpu);
}

static void access_counters_isr_bottom_half_entry(void *args)
{
    UVM_ENTRY_VOID(access_counters_isr_bottom_half(args));
}
static void replayable_faults_retrigger_bottom_half(uvm_parent_gpu_t *parent_gpu)
{
    bool retrigger = false;

    // When Confidential Computing is enabled, UVM does not (indirectly)
    // trigger the replayable fault interrupt by updating GET. This is because,
    // in this configuration, GET is a dummy register used to inform GSP-RM
    // (the owner of the HW replayable fault buffer) of the latest entry
    // consumed by the UVM driver. The real GET register is owned by GSP-RM.
    //
    // The retriggering of a replayable faults bottom half therefore happens
    // manually, by scheduling a bottom half for later if there is any pending
    // work in the fault buffer accessible by UVM. The retriggering addresses
    // two problematic scenarios caused by GET updates not raising any
    // interrupt:
    //
    //   (1) UVM didn't process all the entries up to cached PUT
    //
    //   (2) UVM did process all the entries up to cached PUT, but GSP-RM added
    //       new entries such that cached PUT is out-of-date
    //
    // In both cases, the re-enablement of interrupts would have caused the
    // replayable fault interrupt to be triggered in a non-CC setup, because
    // the updated value of GET is different from PUT. But this is not the case
    // in Confidential Computing, so a bottom half needs to be manually
    // scheduled in order to ensure that all faults are serviced.
    //
    // While in the typical case the retriggering happens within a replayable
    // fault bottom half, it can also happen within a non-interrupt path such
    // as uvm_gpu_fault_buffer_flush.
    if (g_uvm_global.conf_computing_enabled)
        retrigger = true;

    if (!retrigger)
        return;

    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    // If there is pending work, schedule a replayable faults bottom half. It
    // is valid for a bottom half (q_item) to reschedule itself.
    (void)schedule_replayable_faults_handler(parent_gpu);

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
}

void uvm_parent_gpu_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);

    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    // Bump the disable ref count. This guarantees that the bottom half or
    // another thread trying to take the replayable_faults.service_lock won't
    // inadvertently re-enable interrupts during this locking sequence.
    uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu);

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);

    // Now that we know replayable fault interrupts can't get enabled, take the
    // lock.
    uvm_down(&parent_gpu->isr.replayable_faults.service_lock);
}
void uvm_parent_gpu_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);

    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    // The following sequence is delicate:
    //
    //   1) Enable replayable page fault interrupts
    //   2) Rearm pulse-based interrupts
    //   3) Unlock GPU isr.replayable_faults.service_lock (mutex)
    //   4) Unlock isr.interrupts_lock (spin lock)
    //
    // ...because the moment that page fault interrupts are re-enabled, a top
    // half might start receiving them. A top half cannot run on the core
    // executing this code, since interrupts are disabled for as long as the
    // interrupts_lock is held. If it runs on a different core, it's going to
    // spin waiting for the interrupts_lock to be released by this core before
    // attempting to acquire the service_lock mutex. Hence there is no risk of
    // the top half missing interrupts after they are re-enabled, but before
    // the service_lock mutex is released.
    if (parent_gpu->isr.replayable_faults.handling) {
        // Turn page fault interrupts back on, unless remove_gpu() has already
        // removed this GPU from the GPU table. remove_gpu() indicates that
        // situation by setting gpu->replayable_faults.handling to false.
        //
        // This path can only be taken from the bottom half. User threads
        // calling this function must have previously retained the GPU, so they
        // can't race with remove_gpu.
        //
        // TODO: Bug 1766600: Assert that we're in a bottom half thread, once
        //       that's tracked by the lock assertion code.
        //
        // Note that if we're in the bottom half and the GPU was removed before
        // we checked replayable_faults.handling, we won't drop our interrupt
        // disable ref count from the corresponding top-half call to
        // uvm_parent_gpu_replayable_faults_intr_disable. That's OK because
        // remove_gpu ignores the refcount after waiting for the bottom half to
        // finish.
        uvm_parent_gpu_replayable_faults_intr_enable(parent_gpu);

        // Rearm pulse-based interrupts. This guarantees that the state of the
        // pending interrupt is current, and that the top-level rearm performed
        // by RM will only trigger it if necessary. This avoids both of the
        // possible bad cases:
        //   1) GET != PUT, but the interrupt state is not pending.
        //      This could lead to the interrupt being lost.
        //   2) GET == PUT, but the interrupt state is pending.
        //      This could lead to an interrupt storm, as the top half would
        //      see no work to be done but the interrupt would get constantly
        //      retriggered by RM's top-level rearm.
        // clear_replayable_faults is a no-op for architectures that don't
        // support pulse-based interrupts.
        parent_gpu->fault_buffer_hal->clear_replayable_faults(parent_gpu,
                                                              parent_gpu->fault_buffer_info.replayable.cached_get);
    }

    // This has to be an out-of-order unlock, since the interrupts_lock is
    // still being held. Releasing the locks in order would result in a lock
    // order violation.
    uvm_up_out_of_order(&parent_gpu->isr.replayable_faults.service_lock);

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);

    replayable_faults_retrigger_bottom_half(parent_gpu);
}

void uvm_parent_gpu_non_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);

    uvm_down(&parent_gpu->isr.non_replayable_faults.service_lock);
}

void uvm_parent_gpu_non_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);

    uvm_up(&parent_gpu->isr.non_replayable_faults.service_lock);
}

void uvm_parent_gpu_access_counters_isr_lock(uvm_parent_gpu_t *parent_gpu)
{
    // See the comments in uvm_parent_gpu_replayable_faults_isr_lock()

    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    uvm_parent_gpu_access_counters_intr_disable(parent_gpu);

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);

    uvm_down(&parent_gpu->isr.access_counters.service_lock);
}
void uvm_parent_gpu_access_counters_isr_unlock(uvm_parent_gpu_t *parent_gpu)
{
    UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);

    // See the comments in uvm_parent_gpu_replayable_faults_isr_unlock()

    uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);

    uvm_parent_gpu_access_counters_intr_enable(parent_gpu);

    if (parent_gpu->isr.access_counters.handling_ref_count > 0) {
        parent_gpu->access_counter_buffer_hal->clear_access_counter_notifications(parent_gpu,
                                                                                  parent_gpu->access_counter_buffer_info.cached_get);
    }

    // This has to be an out-of-order unlock, since the interrupts_lock is
    // still being held. Releasing the locks in order would result in a lock
    // order violation.
    uvm_up_out_of_order(&parent_gpu->isr.access_counters.service_lock);

    uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
}

static void uvm_parent_gpu_replayable_faults_intr_disable(uvm_parent_gpu_t *parent_gpu)
{
    uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);

    if (parent_gpu->isr.replayable_faults.handling && parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0)
        parent_gpu->fault_buffer_hal->disable_replayable_faults(parent_gpu);

    ++parent_gpu->isr.replayable_faults.disable_intr_ref_count;
}

static void uvm_parent_gpu_replayable_faults_intr_enable(uvm_parent_gpu_t *parent_gpu)
{
    uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
    UVM_ASSERT(parent_gpu->isr.replayable_faults.disable_intr_ref_count > 0);

    --parent_gpu->isr.replayable_faults.disable_intr_ref_count;
    if (parent_gpu->isr.replayable_faults.handling && parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0)
        parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu);
}

void uvm_parent_gpu_access_counters_intr_disable(uvm_parent_gpu_t *parent_gpu)
{
    uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);

    // The read of handling_ref_count could race with a write from
    // gpu_access_counters_enable/disable, since we may not hold the ISR lock
    // here. But those functions are invoked with interrupts disabled
    // (disable_intr_ref_count > 0), so the check always returns false when the
    // race occurs.
    if (parent_gpu->isr.access_counters.handling_ref_count > 0 &&
        parent_gpu->isr.access_counters.disable_intr_ref_count == 0) {
        parent_gpu->access_counter_buffer_hal->disable_access_counter_notifications(parent_gpu);
    }

    ++parent_gpu->isr.access_counters.disable_intr_ref_count;
}

void uvm_parent_gpu_access_counters_intr_enable(uvm_parent_gpu_t *parent_gpu)
{
    uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
    UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.access_counters.service_lock));
    UVM_ASSERT(parent_gpu->isr.access_counters.disable_intr_ref_count > 0);

    --parent_gpu->isr.access_counters.disable_intr_ref_count;

    if (parent_gpu->isr.access_counters.handling_ref_count > 0 &&
        parent_gpu->isr.access_counters.disable_intr_ref_count == 0) {
        parent_gpu->access_counter_buffer_hal->enable_access_counter_notifications(parent_gpu);
    }
}