1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_api.h"
25 #include "uvm_global.h"
26 #include "uvm_gpu_isr.h"
27 #include "uvm_hal.h"
28 #include "uvm_gpu.h"
29 #include "uvm_gpu_access_counters.h"
30 #include "uvm_gpu_non_replayable_faults.h"
31 #include "uvm_thread_context.h"
32 
33 // Level-based vs pulse-based interrupts
34 // =====================================
35 // Turing switches to pulse-based interrupts for replayable/non-replayable
36 // faults and access counter notifications. Prior GPUs use level-based
37 // interrupts.
38 //
39 // Level-based interrupts are rearmed automatically as long as the interrupt
40 // condition is set. Pulse-based interrupts, on the other hand, are
41 // re-triggered by clearing their interrupt line and forcing the interrupt
42 // condition to be re-evaluated. However, RM re-triggers all top-level
43 // interrupts when exiting its top half. Thus, both level-based and pulse-based
44 // interrupts need to be disabled at interrupt handling boundaries, in order to
45 // avoid interrupt storms.
46 //
47 // Moreover, in order to make sure that pulse-based interrupts are not missed,
// we need to clear the interrupt bit and force an interrupt condition
49 // re-evaluation after interrupts are re-enabled. In the case of replayable
50 // faults and access counter notifications the interrupt condition is
51 // re-evaluated by writing to GET. Non-replayable faults work the same way, but
52 // they are currently owned by RM, so UVM doesn't have to do anything.
53 
54 // For use by the nv_kthread_q that is servicing the replayable fault bottom
55 // half, only.
56 static void replayable_faults_isr_bottom_half_entry(void *args);
57 
// For use by the nv_kthread_q that is servicing the non-replayable fault
// bottom half, only.
60 static void non_replayable_faults_isr_bottom_half_entry(void *args);
61 
// For use by the nv_kthread_q that is servicing the access counters bottom
// half, only.
64 static void access_counters_isr_bottom_half_entry(void *args);
65 
66 // Increments the reference count tracking whether replayable page fault
67 // interrupts should be disabled. The caller is guaranteed that replayable page
68 // faults are disabled upon return. Interrupts might already be disabled prior
69 // to making this call. Each call is ref-counted, so this must be paired with a
70 // call to uvm_parent_gpu_replayable_faults_intr_enable().
71 //
72 // parent_gpu->isr.interrupts_lock must be held to call this function.
73 static void uvm_parent_gpu_replayable_faults_intr_disable(uvm_parent_gpu_t *parent_gpu);
74 
75 // Decrements the reference count tracking whether replayable page fault
76 // interrupts should be disabled. Only once the count reaches 0 are the HW
77 // interrupts actually enabled, so this call does not guarantee that the
78 // interrupts have been re-enabled upon return.
79 //
80 // uvm_parent_gpu_replayable_faults_intr_disable() must have been called prior
81 // to calling this function.
82 //
83 // parent_gpu->isr.interrupts_lock must be held to call this function.
84 static void uvm_parent_gpu_replayable_faults_intr_enable(uvm_parent_gpu_t *parent_gpu);
85 
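// Attempt to schedule the replayable faults bottom half. Returns 1 if a bottom
// half was scheduled, or 0 if the GPU is suspended, replayable faults are not
// being handled, the service_lock could not be taken, or no faults are
// pending. On success, a GPU kref is taken, replayable fault interrupts are
// disabled, and ownership of the service_lock is passed to the bottom half.
//
// parent_gpu->isr.interrupts_lock must be held to call this function.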
86 static unsigned schedule_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
87 {
88     uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
89 
90     if (parent_gpu->isr.is_suspended)
91         return 0;
92 
93     // handling gets set to false for all handlers during removal, so quit if
94     // the GPU is in the process of being removed.
95     if (!parent_gpu->isr.replayable_faults.handling)
96         return 0;
97 
    // Use a raw call instead of the UVM lock helper. Lock ownership will be
    // recorded in the bottom half. See the comment in
    // replayable_faults_isr_bottom_half().
100     if (down_trylock(&parent_gpu->isr.replayable_faults.service_lock.sem) != 0)
101         return 0;
102 
103     if (!uvm_gpu_replayable_faults_pending(parent_gpu)) {
104         up(&parent_gpu->isr.replayable_faults.service_lock.sem);
105         return 0;
106     }
107 
108     nv_kref_get(&parent_gpu->gpu_kref);
109 
110     // Interrupts need to be disabled here to avoid an interrupt storm
111     uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu);
112 
    // Schedule a bottom half, but do *not* release the
    // replayable_faults.service_lock. The bottom half releases it as part of
    // its cleanup.
115     nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
116                                  &parent_gpu->isr.replayable_faults.bottom_half_q_item);
117 
118     return 1;
119 }
120 
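// Attempt to schedule the non-replayable faults bottom half. Returns 1 if
// there are pending faults in the shared RM/UVM queue (whether or not a new
// q_item was actually queued), and 0 if the GPU is suspended, non-replayable
// faults are not being handled, or no faults are pending. Unlike the
// replayable faults path, no service_lock is taken and no interrupt disabling
// is needed here, since the non-replayable fault interrupt is owned by RM.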
121 static unsigned schedule_non_replayable_faults_handler(uvm_parent_gpu_t *parent_gpu)
122 {
123     bool scheduled;
124 
125     if (parent_gpu->isr.is_suspended)
126         return 0;
127 
128     // handling gets set to false for all handlers during removal, so quit if
129     // the GPU is in the process of being removed.
130     if (!parent_gpu->isr.non_replayable_faults.handling)
131         return 0;
132 
    // Non-replayable faults are stored in a synchronized circular queue
    // shared by RM and UVM, so we can query the number of pending faults.
    // These faults are not replayed and, since RM advances GET to PUT when
    // copying the fault packets into the queue, no further interrupts will be
    // triggered by the GPU and faults may stay unserviced. Therefore, if there
    // is a fault in the queue, we schedule a bottom half unconditionally.
140     if (!uvm_gpu_non_replayable_faults_pending(parent_gpu))
141         return 0;
142 
143     nv_kref_get(&parent_gpu->gpu_kref);
144 
145     scheduled = nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
146                                              &parent_gpu->isr.non_replayable_faults.bottom_half_q_item) != 0;
147 
148     // If the q_item did not get scheduled because it was already
149     // queued, that instance will handle the pending faults. Just
150     // drop the GPU kref.
151     if (!scheduled)
152         uvm_parent_gpu_kref_put(parent_gpu);
153 
154     return 1;
155 }
156 
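// Attempt to schedule the access counters bottom half. Returns 1 if a bottom
// half was scheduled, or 0 if the GPU is suspended, access counters are not
// being handled, the service_lock could not be taken, or no notifications are
// pending. On success, a GPU kref is taken, access counter interrupts are
// disabled, and ownership of the service_lock is passed to the bottom half.
//
// parent_gpu->isr.interrupts_lock must be held to call this function.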
157 static unsigned schedule_access_counters_handler(uvm_parent_gpu_t *parent_gpu)
158 {
159     uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
160 
161     if (parent_gpu->isr.is_suspended)
162         return 0;
163 
164     if (!parent_gpu->isr.access_counters.handling_ref_count)
165         return 0;
166 
167     if (down_trylock(&parent_gpu->isr.access_counters.service_lock.sem) != 0)
168         return 0;
169 
170     if (!uvm_gpu_access_counters_pending(parent_gpu)) {
171         up(&parent_gpu->isr.access_counters.service_lock.sem);
172         return 0;
173     }
174 
175     nv_kref_get(&parent_gpu->gpu_kref);
176 
177     // Interrupts need to be disabled to avoid an interrupt storm
178     uvm_parent_gpu_access_counters_intr_disable(parent_gpu);
179 
180     nv_kthread_q_schedule_q_item(&parent_gpu->isr.bottom_half_q,
181                                  &parent_gpu->isr.access_counters.bottom_half_q_item);
182 
183     return 1;
184 }
185 
// This is called from RM's top-half ISR (see the nvidia_isr() function). UVM
// is given a chance to handle the interrupt before most of the RM processing,
// and communicates what it did back to RM via the return code:
189 //
190 //     NV_OK:
191 //         UVM handled an interrupt.
192 //
193 //     NV_WARN_MORE_PROCESSING_REQUIRED:
194 //         UVM did not schedule a bottom half, because it was unable to get the locks it
195 //         needed, but there is still UVM work to be done. RM will return "not handled" to the
196 //         Linux kernel, *unless* RM handled other faults in its top half. In that case, the
//         fact that UVM did not handle its interrupt is lost. However, life and interrupt
//         processing continue anyway: the GPU will soon raise another interrupt, because
199 //         that's what it does when there are replayable page faults remaining (GET != PUT in
200 //         the fault buffer).
201 //
202 //     NV_ERR_NO_INTR_PENDING:
203 //         UVM did not find any work to do. Currently this is handled in RM in exactly the same
204 //         way as NV_WARN_MORE_PROCESSING_REQUIRED is handled. However, the extra precision is
205 //         available for the future. RM's interrupt handling tends to evolve as new chips and
206 //         new interrupts get created.
207 
208 static NV_STATUS uvm_isr_top_half(const NvProcessorUuid *gpu_uuid)
209 {
210     uvm_parent_gpu_t *parent_gpu;
211     unsigned num_handlers_scheduled = 0;
212     NV_STATUS status = NV_OK;
213 
214     if (!in_interrupt() && in_atomic()) {
215         // Early-out if we're not in interrupt context, but memory allocations
216         // require GFP_ATOMIC. This happens with CONFIG_DEBUG_SHIRQ enabled,
217         // where the interrupt handler is called as part of its removal to make
218         // sure it's prepared for being called even when it's being freed.
219         // This breaks the assumption that the UVM driver is called in atomic
220         // context only in the interrupt context, which the thread context
221         // management relies on.
222         return NV_OK;
223     }
224 
225     if (!gpu_uuid) {
226         // This can happen early in the main GPU driver initialization, because
227         // that involves testing interrupts before the GPU is fully set up.
228         return NV_ERR_NO_INTR_PENDING;
229     }
230 
231     uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
232 
233     parent_gpu = uvm_parent_gpu_get_by_uuid_locked(gpu_uuid);
234 
235     if (parent_gpu == NULL) {
236         uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
237         return NV_ERR_NO_INTR_PENDING;
238     }
239 
    // We take a reference during the top half, and an additional reference
    // for each scheduled bottom half. References are dropped at the end of the
    // bottom halves.
243     nv_kref_get(&parent_gpu->gpu_kref);
244     uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
245 
    // Now that we have a GPU object, lock it so that it can't be removed
    // without us noticing.
247     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
248 
249     ++parent_gpu->isr.interrupt_count;
250 
251     num_handlers_scheduled += schedule_replayable_faults_handler(parent_gpu);
252     num_handlers_scheduled += schedule_non_replayable_faults_handler(parent_gpu);
253     num_handlers_scheduled += schedule_access_counters_handler(parent_gpu);
254 
255     if (num_handlers_scheduled == 0) {
256         if (parent_gpu->isr.is_suspended)
257             status = NV_ERR_NO_INTR_PENDING;
258         else
259             status = NV_WARN_MORE_PROCESSING_REQUIRED;
260     }
261 
262     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
263 
264     uvm_parent_gpu_kref_put(parent_gpu);
265 
266     return status;
267 }
268 
269 NV_STATUS uvm_isr_top_half_entry(const NvProcessorUuid *gpu_uuid)
270 {
271     UVM_ENTRY_RET(uvm_isr_top_half(gpu_uuid));
272 }
273 
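// Initialize a bottom-half kthread queue, pinning its worker thread to the
// CPUs of the given NUMA node when thread affinity is supported and the node
// has CPUs. Otherwise, fall back to the default queue initialization.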
274 static NV_STATUS init_queue_on_node(nv_kthread_q_t *queue, const char *name, int node)
275 {
276 #if UVM_THREAD_AFFINITY_SUPPORTED()
277     if (node != -1 && !cpumask_empty(uvm_cpumask_of_node(node))) {
278         NV_STATUS status;
279 
280         status = errno_to_nv_status(nv_kthread_q_init_on_node(queue, name, node));
281         if (status != NV_OK)
282             return status;
283 
284         return errno_to_nv_status(set_cpus_allowed_ptr(queue->q_kthread, uvm_cpumask_of_node(node)));
285     }
286 #endif
287 
288     return errno_to_nv_status(nv_kthread_q_init(queue, name));
289 }
290 
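// Set up interrupt servicing state for the parent GPU: the replayable fault
// buffer, the bottom-half q_items and kthread queues, the per-CPU execution
// stats and, when supported, non-replayable fault and access counter state.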
291 NV_STATUS uvm_parent_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
292 {
293     NV_STATUS status = NV_OK;
294     char kthread_name[TASK_COMM_LEN + 1];
295     uvm_va_block_context_t *block_context;
296 
297     if (parent_gpu->replayable_faults_supported) {
298         status = uvm_gpu_fault_buffer_init(parent_gpu);
299         if (status != NV_OK) {
300             UVM_ERR_PRINT("Failed to initialize GPU fault buffer: %s, GPU: %s\n",
301                           nvstatusToString(status),
302                           uvm_parent_gpu_name(parent_gpu));
303             return status;
304         }
305 
306         nv_kthread_q_item_init(&parent_gpu->isr.replayable_faults.bottom_half_q_item,
307                                replayable_faults_isr_bottom_half_entry,
308                                parent_gpu);
309 
310         parent_gpu->isr.replayable_faults.stats.cpu_exec_count =
311             uvm_kvmalloc_zero(sizeof(*parent_gpu->isr.replayable_faults.stats.cpu_exec_count) * num_possible_cpus());
312         if (!parent_gpu->isr.replayable_faults.stats.cpu_exec_count)
313             return NV_ERR_NO_MEMORY;
314 
315         block_context = uvm_va_block_context_alloc(NULL);
316         if (!block_context)
317             return NV_ERR_NO_MEMORY;
318 
319         parent_gpu->fault_buffer_info.replayable.block_service_context.block_context = block_context;
320 
321         parent_gpu->isr.replayable_faults.handling = true;
322 
323         snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u BH", uvm_parent_id_value(parent_gpu->id));
324         status = init_queue_on_node(&parent_gpu->isr.bottom_half_q, kthread_name, parent_gpu->closest_cpu_numa_node);
325         if (status != NV_OK) {
326             UVM_ERR_PRINT("Failed in nv_kthread_q_init for bottom_half_q: %s, GPU %s\n",
327                           nvstatusToString(status),
328                           uvm_parent_gpu_name(parent_gpu));
329             return status;
330         }
331 
332         if (parent_gpu->non_replayable_faults_supported) {
333             nv_kthread_q_item_init(&parent_gpu->isr.non_replayable_faults.bottom_half_q_item,
334                                    non_replayable_faults_isr_bottom_half_entry,
335                                    parent_gpu);
336 
337             parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count =
338                 uvm_kvmalloc_zero(sizeof(*parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count) *
339                                   num_possible_cpus());
340             if (!parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count)
341                 return NV_ERR_NO_MEMORY;
342 
343             block_context = uvm_va_block_context_alloc(NULL);
344             if (!block_context)
345                 return NV_ERR_NO_MEMORY;
346 
347             parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context = block_context;
348 
349             parent_gpu->isr.non_replayable_faults.handling = true;
350 
351             snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u KC", uvm_parent_id_value(parent_gpu->id));
352             status = init_queue_on_node(&parent_gpu->isr.kill_channel_q,
353                                         kthread_name,
354                                         parent_gpu->closest_cpu_numa_node);
355             if (status != NV_OK) {
356                 UVM_ERR_PRINT("Failed in nv_kthread_q_init for kill_channel_q: %s, GPU %s\n",
357                               nvstatusToString(status),
358                               uvm_parent_gpu_name(parent_gpu));
359                 return status;
360             }
361         }
362 
363         if (parent_gpu->access_counters_supported) {
364             status = uvm_gpu_init_access_counters(parent_gpu);
365             if (status != NV_OK) {
366                 UVM_ERR_PRINT("Failed to initialize GPU access counters: %s, GPU: %s\n",
367                               nvstatusToString(status),
368                               uvm_parent_gpu_name(parent_gpu));
369                 return status;
370             }
371 
372             block_context = uvm_va_block_context_alloc(NULL);
373             if (!block_context)
374                 return NV_ERR_NO_MEMORY;
375 
376             parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context =
377                 block_context;
378 
379             nv_kthread_q_item_init(&parent_gpu->isr.access_counters.bottom_half_q_item,
380                                    access_counters_isr_bottom_half_entry,
381                                    parent_gpu);
382 
            // Access counter interrupts are initially disabled. They are
            // dynamically enabled when the GPU is registered in a VA space.
385             parent_gpu->isr.access_counters.handling_ref_count = 0;
386             parent_gpu->isr.access_counters.stats.cpu_exec_count =
387                 uvm_kvmalloc_zero(sizeof(*parent_gpu->isr.access_counters.stats.cpu_exec_count) * num_possible_cpus());
388             if (!parent_gpu->isr.access_counters.stats.cpu_exec_count)
389                 return NV_ERR_NO_MEMORY;
390         }
391     }
392 
393     return NV_OK;
394 }
395 
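// Wait for all queued bottom-half work on the bottom_half_q and kill_channel_q
// queues to complete.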
396 void uvm_parent_gpu_flush_bottom_halves(uvm_parent_gpu_t *parent_gpu)
397 {
398     nv_kthread_q_flush(&parent_gpu->isr.bottom_half_q);
399     nv_kthread_q_flush(&parent_gpu->isr.kill_channel_q);
400 }
401 
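// Stop the top half from scheduling new bottom halves for this GPU, then flush
// and stop the bottom-half kthread queues. Called during GPU removal, after
// the parent GPU has been removed from the global table.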
402 void uvm_parent_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu)
403 {
404     UVM_ASSERT(parent_gpu->isr.access_counters.handling_ref_count == 0);
405 
406     // Now that the GPU is safely out of the global table, lock the GPU and mark
407     // it as no longer handling interrupts so the top half knows not to schedule
408     // any more bottom halves.
409     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
410 
411     uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu);
412 
413     parent_gpu->isr.replayable_faults.was_handling = parent_gpu->isr.replayable_faults.handling;
414     parent_gpu->isr.non_replayable_faults.was_handling = parent_gpu->isr.non_replayable_faults.handling;
415 
416     parent_gpu->isr.replayable_faults.handling = false;
417     parent_gpu->isr.non_replayable_faults.handling = false;
418 
419     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
420 
421     // Flush all bottom half ISR work items and stop the nv_kthread_q that is
422     // servicing this GPU's bottom halves. Note that this requires that the
423     // bottom half never take the global lock, since we're holding it here.
424     //
    // Note that it's safe to call nv_kthread_q_stop() even if
    // nv_kthread_q_init() failed in uvm_parent_gpu_init_isr().
427     nv_kthread_q_stop(&parent_gpu->isr.bottom_half_q);
428     nv_kthread_q_stop(&parent_gpu->isr.kill_channel_q);
429 }
430 
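// Release the ISR state set up in uvm_parent_gpu_init_isr(): return fault
// buffer ownership to RM, deinitialize access counters, and free the block
// contexts and per-CPU stats arrays.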
431 void uvm_parent_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
432 {
433     uvm_va_block_context_t *block_context;
434 
435     // Return ownership to RM:
436     if (parent_gpu->isr.replayable_faults.was_handling) {
437         // No user threads could have anything left on
438         // replayable_faults.disable_intr_ref_count since they must retain the
439         // GPU across uvm_parent_gpu_replayable_faults_isr_lock/
440         // uvm_parent_gpu_replayable_faults_isr_unlock. This means the
        // uvm_parent_gpu_replayable_faults_intr_disable() call in
        // uvm_parent_gpu_disable_isr() above could only have raced with bottom
        // halves.
443         //
444         // If we cleared replayable_faults.handling before the bottom half got
445         // to its uvm_parent_gpu_replayable_faults_isr_unlock, when it
446         // eventually reached uvm_parent_gpu_replayable_faults_isr_unlock it
        // would have skipped the enable, leaving us with extra ref counts
448         // here.
449         //
450         // In any case we're guaranteed that replayable faults interrupts are
451         // disabled and can't get re-enabled, so we can safely ignore the ref
452         // count value and just clean things up.
453         UVM_ASSERT_MSG(parent_gpu->isr.replayable_faults.disable_intr_ref_count > 0,
454                        "%s replayable_faults.disable_intr_ref_count: %llu\n",
455                        uvm_parent_gpu_name(parent_gpu),
456                        parent_gpu->isr.replayable_faults.disable_intr_ref_count);
457 
458         uvm_gpu_fault_buffer_deinit(parent_gpu);
459     }
460 
461     if (parent_gpu->access_counters_supported) {
462         // It is safe to deinitialize access counters even if they have not been
463         // successfully initialized.
464         uvm_gpu_deinit_access_counters(parent_gpu);
465         block_context =
466             parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context;
467         uvm_va_block_context_free(block_context);
468     }
469 
470     if (parent_gpu->non_replayable_faults_supported) {
471         block_context = parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context;
472         uvm_va_block_context_free(block_context);
473     }
474 
475     block_context = parent_gpu->fault_buffer_info.replayable.block_service_context.block_context;
476     uvm_va_block_context_free(block_context);
477     uvm_kvfree(parent_gpu->isr.replayable_faults.stats.cpu_exec_count);
478     uvm_kvfree(parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count);
479     uvm_kvfree(parent_gpu->isr.access_counters.stats.cpu_exec_count);
480 }
481 
482 static uvm_gpu_t *find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu)
483 {
484     uvm_gpu_t *gpu;
485 
486     // When SMC is enabled, there's no longer a 1:1 relationship between the
487     // parent and the partitions.  But because all relevant interrupt paths
488     // are shared, as is the fault reporting logic, it's sufficient here
489     // to proceed with any valid uvm_gpu_t, even if the corresponding partition
490     // didn't cause all, or even any of the interrupts.
491     // The bottom half handlers will later find the appropriate partitions by
492     // attributing the notifications to VA spaces as necessary.
493     if (parent_gpu->smc.enabled) {
494         NvU32 sub_processor_index;
495 
496         uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
497 
498         sub_processor_index = find_first_bit(parent_gpu->valid_gpus, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
499 
500         if (sub_processor_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS) {
501             gpu = parent_gpu->gpus[sub_processor_index];
502             UVM_ASSERT(gpu != NULL);
503         }
504         else {
505             gpu = NULL;
506         }
507 
508         uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
509     }
510     else {
511         gpu = parent_gpu->gpus[0];
512         UVM_ASSERT(gpu != NULL);
513     }
514 
515     return gpu;
516 }
517 
518 static void replayable_faults_isr_bottom_half(void *args)
519 {
520     uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
521     uvm_gpu_t *gpu;
522     unsigned int cpu;
523 
524     gpu = find_first_valid_gpu(parent_gpu);
525     if (gpu == NULL)
526         goto put_kref;
527 
528     UVM_ASSERT(parent_gpu->replayable_faults_supported);
529 
530     // Record the lock ownership
531     // The service_lock semaphore is taken in the top half using a raw
532     // semaphore call (down_trylock()). Here, the lock "ownership" is recorded,
    // using a direct call to uvm_record_lock(). The pairing of the two raw
    // calls results in an ownership "transfer" between the top and bottom
    // halves.
535     // Due to this ownership transfer, other usages of the service_lock can
536     // use the UVM (un)lock helpers to handle lock ownership and record keeping.
537     uvm_record_lock(&parent_gpu->isr.replayable_faults.service_lock, UVM_LOCK_FLAGS_MODE_SHARED);
538 
539     // Multiple bottom halves for replayable faults can be running
540     // concurrently, but only one can be running this function for a given GPU
541     // since we enter with the replayable_faults.service_lock held.
542     cpu = get_cpu();
543     ++parent_gpu->isr.replayable_faults.stats.bottom_half_count;
544     cpumask_set_cpu(cpu, &parent_gpu->isr.replayable_faults.stats.cpus_used_mask);
545     ++parent_gpu->isr.replayable_faults.stats.cpu_exec_count[cpu];
546     put_cpu();
547 
548     uvm_gpu_service_replayable_faults(gpu);
549 
550     uvm_parent_gpu_replayable_faults_isr_unlock(parent_gpu);
551 
552 put_kref:
553     // It is OK to drop a reference on the parent GPU if a bottom half has
554     // been retriggered within uvm_parent_gpu_replayable_faults_isr_unlock,
555     // because the rescheduling added an additional reference.
556     uvm_parent_gpu_kref_put(parent_gpu);
557 }
558 
559 static void replayable_faults_isr_bottom_half_entry(void *args)
560 {
    UVM_ENTRY_VOID(replayable_faults_isr_bottom_half(args));
562 }
563 
564 static void non_replayable_faults_isr_bottom_half(void *args)
565 {
566     uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
567     uvm_gpu_t *gpu;
568     unsigned int cpu;
569 
570     gpu = find_first_valid_gpu(parent_gpu);
571     if (gpu == NULL)
572         goto put_kref;
573 
574     UVM_ASSERT(parent_gpu->non_replayable_faults_supported);
575 
576     uvm_parent_gpu_non_replayable_faults_isr_lock(parent_gpu);
577 
578     // Multiple bottom halves for non-replayable faults can be running
579     // concurrently, but only one can enter this section for a given GPU
    // since we acquired the non_replayable_faults.service_lock.
581     cpu = get_cpu();
582     ++parent_gpu->isr.non_replayable_faults.stats.bottom_half_count;
583     cpumask_set_cpu(cpu, &parent_gpu->isr.non_replayable_faults.stats.cpus_used_mask);
584     ++parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count[cpu];
585     put_cpu();
586 
587     uvm_gpu_service_non_replayable_fault_buffer(gpu);
588 
589     uvm_parent_gpu_non_replayable_faults_isr_unlock(parent_gpu);
590 
591 put_kref:
592     uvm_parent_gpu_kref_put(parent_gpu);
593 }
594 
595 static void non_replayable_faults_isr_bottom_half_entry(void *args)
596 {
    UVM_ENTRY_VOID(non_replayable_faults_isr_bottom_half(args));
598 }
599 
600 static void access_counters_isr_bottom_half(void *args)
601 {
602     uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)args;
603     uvm_gpu_t *gpu;
604     unsigned int cpu;
605 
606     gpu = find_first_valid_gpu(parent_gpu);
607     if (gpu == NULL)
608         goto put_kref;
609 
610     UVM_ASSERT(parent_gpu->access_counters_supported);
611 
612     uvm_record_lock(&parent_gpu->isr.access_counters.service_lock, UVM_LOCK_FLAGS_MODE_SHARED);
613 
    // Multiple bottom halves for access counter notifications can be running
    // concurrently, but only one can be running this function for a given GPU
    // since we enter with the access_counters.service_lock held.
617     cpu = get_cpu();
618     ++parent_gpu->isr.access_counters.stats.bottom_half_count;
619     cpumask_set_cpu(cpu, &parent_gpu->isr.access_counters.stats.cpus_used_mask);
620     ++parent_gpu->isr.access_counters.stats.cpu_exec_count[cpu];
621     put_cpu();
622 
623     uvm_gpu_service_access_counters(gpu);
624 
625     uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);
626 
627 put_kref:
628     uvm_parent_gpu_kref_put(parent_gpu);
629 }
630 
631 static void access_counters_isr_bottom_half_entry(void *args)
632 {
    UVM_ENTRY_VOID(access_counters_isr_bottom_half(args));
634 }
635 
636 static void replayable_faults_retrigger_bottom_half(uvm_parent_gpu_t *parent_gpu)
637 {
638     bool retrigger = false;
639 
640     // When Confidential Computing is enabled, UVM does not (indirectly) trigger
641     // the replayable fault interrupt by updating GET. This is because, in this
642     // configuration, GET is a dummy register used to inform GSP-RM (the owner
643     // of the HW replayable fault buffer) of the latest entry consumed by the
644     // UVM driver. The real GET register is owned by GSP-RM.
645     //
    // The retriggering of a replayable faults bottom half therefore happens
    // manually, by scheduling a bottom half for later if there is any pending
    // work in the fault buffer accessible by UVM. The retriggering addresses
    // two problematic scenarios caused by GET updates not setting any
650     // interrupt:
651     //
652     //   (1) UVM didn't process all the entries up to cached PUT
653     //
654     //   (2) UVM did process all the entries up to cached PUT, but GSP-RM
655     //       added new entries such that cached PUT is out-of-date
656     //
657     // In both cases, re-enablement of interrupts would have caused the
658     // replayable fault to be triggered in a non-CC setup, because the updated
    // value of GET is different from PUT. But this is not the case in
    // Confidential Computing, so a bottom half needs to be manually scheduled
    // in order to ensure that all faults are serviced.
662     //
663     // While in the typical case the retriggering happens within a replayable
664     // fault bottom half, it can also happen within a non-interrupt path such as
665     // uvm_gpu_fault_buffer_flush.
666     if (g_uvm_global.conf_computing_enabled)
667         retrigger = true;
668 
669     if (!retrigger)
670         return;
671 
672     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
673 
674     // If there is pending work, schedule a replayable faults bottom
675     // half. It is valid for a bottom half (q_item) to reschedule itself.
676     (void) schedule_replayable_faults_handler(parent_gpu);
677 
678     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
679 }
680 
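// Take the replayable faults service_lock. The interrupt disable reference
// count is bumped first so that replayable fault interrupts stay disabled
// while the lock is held. The paired call to
// uvm_parent_gpu_replayable_faults_isr_unlock() drops that reference and
// rearms the pulse-based interrupt.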
681 void uvm_parent_gpu_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu)
682 {
683     UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);
684 
685     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
686 
687     // Bump the disable ref count. This guarantees that the bottom half or
688     // another thread trying to take the replayable_faults.service_lock won't
689     // inadvertently re-enable interrupts during this locking sequence.
690     uvm_parent_gpu_replayable_faults_intr_disable(parent_gpu);
691 
692     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
693 
694     // Now that we know replayable fault interrupts can't get enabled, take the
695     // lock.
696     uvm_down(&parent_gpu->isr.replayable_faults.service_lock);
697 }
698 
699 void uvm_parent_gpu_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu)
700 {
701     UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);
702 
703     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
704 
705     // The following sequence is delicate:
706     //
707     //     1) Enable replayable page fault interrupts
    //     2) Rearm pulse-based interrupts
709     //     3) Unlock GPU isr.replayable_faults.service_lock (mutex)
710     //     4) Unlock isr.interrupts_lock (spin lock)
711     //
712     // ...because the moment that page fault interrupts are reenabled, a top
713     // half might start receiving them. A top-half cannot run on the core
714     // executing this code as interrupts are disabled as long as the
715     // interrupts_lock is held. If it runs on a different core, it's going to
716     // spin waiting for the interrupts_lock to be released by this core before
717     // attempting to acquire the service_lock mutex. Hence there is no risk of
718     // the top-half missing interrupts after they are reenabled, but before the
719     // service_lock mutex is released.
720 
721     if (parent_gpu->isr.replayable_faults.handling) {
722         // Turn page fault interrupts back on, unless remove_gpu() has already
723         // removed this GPU from the GPU table. remove_gpu() indicates that
724         // situation by setting gpu->replayable_faults.handling to false.
725         //
726         // This path can only be taken from the bottom half. User threads
727         // calling this function must have previously retained the GPU, so they
728         // can't race with remove_gpu.
729         //
730         // TODO: Bug 1766600: Assert that we're in a bottom half thread, once
731         //       that's tracked by the lock assertion code.
732         //
733         // Note that if we're in the bottom half and the GPU was removed before
734         // we checked replayable_faults.handling, we won't drop our interrupt
735         // disable ref count from the corresponding top-half call to
736         // uvm_parent_gpu_replayable_faults_intr_disable. That's ok because
737         // remove_gpu ignores the refcount after waiting for the bottom half to
738         // finish.
739         uvm_parent_gpu_replayable_faults_intr_enable(parent_gpu);
740 
741         // Rearm pulse interrupts. This guarantees that the state of the pending
742         // interrupt is current and the top level rearm performed by RM is only
743         // going to trigger it if necessary. This avoids both of the possible
744         // bad cases:
745         //  1) GET != PUT but interrupt state is not pending
746         //     This could lead to the interrupt being lost.
747         //  2) GET == PUT but interrupt state is pending
748         //     This could lead to an interrupt storm as the top-half would see
749         //     no work to be done, but the interrupt would get constantly
750         //     retriggered by RM's top level rearm.
751         // clear_replayable_faults is a no-op for architectures that don't
752         // support pulse-based interrupts.
753         parent_gpu->fault_buffer_hal->clear_replayable_faults(parent_gpu,
754                                                               parent_gpu->fault_buffer_info.replayable.cached_get);
755     }
756 
    // This unlock has to use the out-of-order variant because the
    // interrupts_lock is still held; a regular unlock would be flagged as a
    // lock order violation.
759     uvm_up_out_of_order(&parent_gpu->isr.replayable_faults.service_lock);
760 
761     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
762 
763     replayable_faults_retrigger_bottom_half(parent_gpu);
764 }
765 
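// Non-replayable fault interrupts are owned by RM, so unlike the replayable
// faults path there is no interrupt disable reference count to manage here;
// only the service_lock is taken and released.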
766 void uvm_parent_gpu_non_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu)
767 {
768     UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);
769 
770     uvm_down(&parent_gpu->isr.non_replayable_faults.service_lock);
771 }
772 
773 void uvm_parent_gpu_non_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu)
774 {
775     UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);
776 
777     uvm_up(&parent_gpu->isr.non_replayable_faults.service_lock);
778 }
779 
780 void uvm_parent_gpu_access_counters_isr_lock(uvm_parent_gpu_t *parent_gpu)
781 {
782     // See comments in uvm_parent_gpu_replayable_faults_isr_lock
783 
784     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
785 
786     uvm_parent_gpu_access_counters_intr_disable(parent_gpu);
787 
788     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
789 
790     uvm_down(&parent_gpu->isr.access_counters.service_lock);
791 }
792 
793 void uvm_parent_gpu_access_counters_isr_unlock(uvm_parent_gpu_t *parent_gpu)
794 {
795     UVM_ASSERT(nv_kref_read(&parent_gpu->gpu_kref) > 0);
796 
797     // See comments in uvm_parent_gpu_replayable_faults_isr_unlock
798 
799     uvm_spin_lock_irqsave(&parent_gpu->isr.interrupts_lock);
800 
801     uvm_parent_gpu_access_counters_intr_enable(parent_gpu);
802 
803     if (parent_gpu->isr.access_counters.handling_ref_count > 0) {
804         parent_gpu->access_counter_buffer_hal->clear_access_counter_notifications(parent_gpu,
805                                                                                   parent_gpu->access_counter_buffer_info.cached_get);
806     }
807 
    // This unlock has to use the out-of-order variant because the
    // interrupts_lock is still held; a regular unlock would be flagged as a
    // lock order violation.
810     uvm_up_out_of_order(&parent_gpu->isr.access_counters.service_lock);
811 
812     uvm_spin_unlock_irqrestore(&parent_gpu->isr.interrupts_lock);
813 }
814 
815 static void uvm_parent_gpu_replayable_faults_intr_disable(uvm_parent_gpu_t *parent_gpu)
816 {
817     uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
818 
819     if (parent_gpu->isr.replayable_faults.handling && parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0)
820         parent_gpu->fault_buffer_hal->disable_replayable_faults(parent_gpu);
821 
822     ++parent_gpu->isr.replayable_faults.disable_intr_ref_count;
823 }
824 
825 static void uvm_parent_gpu_replayable_faults_intr_enable(uvm_parent_gpu_t *parent_gpu)
826 {
827     uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
828     UVM_ASSERT(parent_gpu->isr.replayable_faults.disable_intr_ref_count > 0);
829 
830     --parent_gpu->isr.replayable_faults.disable_intr_ref_count;
831     if (parent_gpu->isr.replayable_faults.handling && parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0)
832         parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu);
833 }
834 
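// Increments the reference count tracking whether access counter interrupts
// should be disabled. HW notifications are disabled on the first increment,
// if access counters are currently being handled.
//
// parent_gpu->isr.interrupts_lock must be held to call this function.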
835 void uvm_parent_gpu_access_counters_intr_disable(uvm_parent_gpu_t *parent_gpu)
836 {
837     uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
838 
839     // The read of handling_ref_count could race with a write from
840     // gpu_access_counters_enable/disable, since here we may not hold the
841     // ISR lock. But those functions are invoked with the interrupt disabled
842     // (disable_intr_ref_count > 0), so the check always returns false when the
    // race occurs.
844     if (parent_gpu->isr.access_counters.handling_ref_count > 0 &&
845         parent_gpu->isr.access_counters.disable_intr_ref_count == 0) {
846         parent_gpu->access_counter_buffer_hal->disable_access_counter_notifications(parent_gpu);
847     }
848 
849     ++parent_gpu->isr.access_counters.disable_intr_ref_count;
850 }
851 
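// Decrements the reference count tracking whether access counter interrupts
// should be disabled. HW notifications are re-enabled only when the count
// reaches 0 and access counters are being handled.
//
// parent_gpu->isr.interrupts_lock and the access counters service_lock must be
// held to call this function.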
852 void uvm_parent_gpu_access_counters_intr_enable(uvm_parent_gpu_t *parent_gpu)
853 {
854     uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock);
855     UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.access_counters.service_lock));
856     UVM_ASSERT(parent_gpu->isr.access_counters.disable_intr_ref_count > 0);
857 
858     --parent_gpu->isr.access_counters.disable_intr_ref_count;
859 
860     if (parent_gpu->isr.access_counters.handling_ref_count > 0 &&
861         parent_gpu->isr.access_counters.disable_intr_ref_count == 0) {
862         parent_gpu->access_counter_buffer_hal->enable_access_counter_notifications(parent_gpu);
863     }
864 }
865