/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

// Declarations for UVM interrupt (ISR) handling: the per-interrupt-type handler
// state, the aggregate ISR state, and the routines that initialize, tear down,
// lock, and enable/disable interrupt servicing.

#ifndef __UVM_GPU_ISR_H__
#define __UVM_GPU_ISR_H__

#include "nv-kthread-q.h"
#include "uvm_common.h"
#include "uvm_lock.h"
#include "uvm_forward_decl.h"

// ISR handling state for a specific interrupt type
typedef struct
{
    // Protects against changes to the GPU data structures used by the handling
    // routines of this interrupt type.
    uvm_semaphore_t service_lock;

    // Bottom-half to be executed for this interrupt. There is one bottom-half
    // per interrupt type.
    nv_kthread_q_item_t bottom_half_q_item;

    // "Is this interrupt being handled" state. The two members below share
    // storage, so only the one matching the handler's interrupt type (as noted
    // on each member) is meaningful.
    union
    {
        // Used for replayable and non-replayable faults.
        struct
        {
            // This is set to true during add_gpu(), if the GPU supports the
            // interrupt. It is set back to false during remove_gpu().
            // interrupts_lock must be held in order to write this variable.
            bool handling;

            // Variable set in uvm_parent_gpu_disable_isr() during remove_gpu()
            // to indicate if this type of interrupt was being handled by the
            // driver.
            bool was_handling;
        };

        // Used for access counters.
        //
        // If the GPU does not support access counters, the ref count is always
        // zero. Otherwise, the refcount is incremented when the GPU is
        // registered in a VA space for the first time, and decremented when
        // unregistered or the VA space is destroyed.
        //
        // Locking: protected by the GPU access counters ISR lock. Naked
        // accesses are allowed during GPU addition and removal.
        NvU64 handling_ref_count;
    };

    // Bottom-half execution statistics for this interrupt type.
    struct
    {
        // Number of bottom-half invocations for this interrupt on a GPU over
        // its lifetime
        NvU64 bottom_half_count;

        // A bitmask of the CPUs on which the bottom half has executed. The
        // corresponding bit gets set once the bottom half executes on that
        // CPU.
        // This mask is useful when testing that the bottom half is getting
        // executed on the correct set of CPUs.
        struct cpumask cpus_used_mask;

        // An array (one per possible CPU), which holds the number of times the
        // bottom half has executed on that CPU.
        NvU64 *cpu_exec_count;
    } stats;

    // This is the number of times the function that disables this type of
    // interrupt has been called without a corresponding call to the function
    // that enables it. If this is > 0, interrupts are disabled. This field is
    // protected by interrupts_lock. This field is only valid for interrupts
    // directly owned by UVM:
    // - replayable_faults
    // - access_counters
    NvU64 disable_intr_ref_count;
} uvm_intr_handler_t;

// State for all ISR handling in UVM
typedef struct
{
    // This is set by uvm_suspend() and uvm_resume() to indicate whether
    // top-half ISR processing is suspended for power management. Calls from
    // the RM's top-half are to be completed without processing when this
    // flag is set to true.
    bool is_suspended;

    // There is exactly one nv_kthread_q per GPU. It is used for the ISR bottom
    // halves. So N CPUs will be servicing M GPUs, in general. There is one
    // bottom-half per interrupt type.
    nv_kthread_q_t bottom_half_q;

    // Protects the state of interrupts (enabled/disabled) and whether the GPU is
    // currently handling them. Taken in both interrupt and process context.
    uvm_spinlock_irqsave_t interrupts_lock;

    // One handler state per interrupt type.
    uvm_intr_handler_t replayable_faults;
    uvm_intr_handler_t non_replayable_faults;
    uvm_intr_handler_t access_counters;

    // Kernel thread used to kill channels on fatal non-replayable faults.
    // This is needed because we cannot call into RM from the bottom-half to
    // avoid deadlocks.
    nv_kthread_q_t kill_channel_q;

    // Number of top-half ISRs called for this GPU over its lifetime
    NvU64 interrupt_count;
} uvm_isr_info_t;

// Entry point for interrupt handling. This is called from RM's top half.
// NOTE(review): gpu_uuid presumably identifies the GPU the interrupt belongs
// to; confirm against the implementation.
NV_STATUS uvm_isr_top_half_entry(const NvProcessorUuid *gpu_uuid);

// Initialize ISR handling state
NV_STATUS uvm_parent_gpu_init_isr(uvm_parent_gpu_t *parent_gpu);

// Flush any currently scheduled bottom halves. This is called during GPU
// removal.
void uvm_parent_gpu_flush_bottom_halves(uvm_parent_gpu_t *parent_gpu);

// Prevent new bottom halves from being scheduled. This is called during parent
// GPU removal.
void uvm_parent_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu);

// Destroy ISR handling state and return interrupt ownership to RM. This is
// called during parent GPU removal
void uvm_parent_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu);

// Take parent_gpu->isr.replayable_faults.service_lock from a non-top/bottom
// half thread. This will also disable replayable page fault interrupts (if
// supported by the GPU) because the top half attempts to take this lock, and we
// would cause an interrupt storm if we didn't disable them first.
//
// At least one GPU under the parent must have been previously retained.
void uvm_parent_gpu_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu);

// Unlock parent_gpu->isr.replayable_faults.service_lock. This call may
// re-enable replayable page fault interrupts. Unlike
// uvm_parent_gpu_replayable_faults_isr_lock(), which should only be called from
// non-top/bottom half threads, this can be called by any thread.
void uvm_parent_gpu_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu);

// Lock/unlock routines for non-replayable faults. These do not need to prevent
// interrupt storms since the GPU fault buffers for non-replayable faults are
// managed by RM. Unlike uvm_parent_gpu_replayable_faults_isr_lock, no GPUs
// under the parent need to have been previously retained.
void uvm_parent_gpu_non_replayable_faults_isr_lock(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_non_replayable_faults_isr_unlock(uvm_parent_gpu_t *parent_gpu);

// See uvm_parent_gpu_replayable_faults_isr_lock/unlock
void uvm_parent_gpu_access_counters_isr_lock(uvm_parent_gpu_t *parent_gpu);
void uvm_parent_gpu_access_counters_isr_unlock(uvm_parent_gpu_t *parent_gpu);

// Increments the reference count tracking whether access counter interrupts
// should be disabled. The caller is guaranteed that access counter interrupts
// are disabled upon return. Interrupts might already be disabled prior to
// making this call. Each call is ref-counted, so this must be paired with a
// call to uvm_parent_gpu_access_counters_intr_enable().
//
// parent_gpu->isr.interrupts_lock must be held to call this function.
void uvm_parent_gpu_access_counters_intr_disable(uvm_parent_gpu_t *parent_gpu);

// Decrements the reference count tracking whether access counter interrupts
// should be disabled. Only once the count reaches 0 are the HW interrupts
// actually enabled, so this call does not guarantee that the interrupts have
// been re-enabled upon return.
//
// uvm_parent_gpu_access_counters_intr_disable() must have been called prior to
// calling this function.
//
// NOTE: For pulse-based interrupts, the caller is responsible for re-arming
// the interrupt.
//
// parent_gpu->isr.interrupts_lock must be held to call this function.
void uvm_parent_gpu_access_counters_intr_enable(uvm_parent_gpu_t *parent_gpu);

#endif // __UVM_GPU_ISR_H__