/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_linux.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_conf_computing.h"
#include "nv_uvm_types.h"
#include "hwref/volta/gv100/dev_fault.h"
#include "hwref/volta/gv100/dev_fb.h"
#include "clc369.h"
#include "uvm_volta_fault_buffer.h"

typedef struct {
    NvU8 bufferEntry[NVC369_BUF_SIZE];
} fault_buffer_entry_c369_t;

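// Read the PUT index of the replayable fault buffer through the RM-provided
// register mapping. Only the PTR field is returned; the GETPTR_CORRUPTED flag
// is expected to be clear.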
NvU32 uvm_hal_volta_fault_buffer_read_put(uvm_parent_gpu_t *parent_gpu)
{
    NvU32 put = UVM_GPU_READ_ONCE(*parent_gpu->fault_buffer_info.rm_info.replayable.pFaultBufferPut);
    NvU32 index = READ_HWVALUE(put, _PFB_PRI_MMU, FAULT_BUFFER_PUT, PTR);
    UVM_ASSERT(READ_HWVALUE(put, _PFB_PRI_MMU, FAULT_BUFFER_PUT, GETPTR_CORRUPTED) ==
               NV_PFB_PRI_MMU_FAULT_BUFFER_PUT_GETPTR_CORRUPTED_NO);

    return index;
}

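// Read the GET index of the replayable fault buffer and return its PTR field.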
NvU32 uvm_hal_volta_fault_buffer_read_get(uvm_parent_gpu_t *parent_gpu)
{
    NvU32 get = UVM_GPU_READ_ONCE(*parent_gpu->fault_buffer_info.rm_info.replayable.pFaultBufferGet);
    UVM_ASSERT(get < parent_gpu->fault_buffer_info.replayable.max_faults);

    return READ_HWVALUE(get, _PFB_PRI_MMU, FAULT_BUFFER_GET, PTR);
}

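// Update the GET index of the replayable fault buffer. When UVM owns the HW
// buffer, this also clears the GETPTR_CORRUPTED and OVERFLOW bits with a
// second write; see the comments below for why two writes are needed.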
void uvm_hal_volta_fault_buffer_write_get(uvm_parent_gpu_t *parent_gpu, NvU32 index)
{
    NvU32 get = HWVALUE(_PFB_PRI_MMU, FAULT_BUFFER_GET, PTR, index);

    UVM_ASSERT(index < parent_gpu->fault_buffer_info.replayable.max_faults);

    // If HW has detected an overflow condition (PUT == GET - 1 and a fault has
    // arrived, which is dropped due to no more space in the fault buffer), it
    // will not deliver any more faults into the buffer until the overflow
    // condition has been cleared. The overflow condition is cleared by
    // updating the GET index to indicate space in the buffer and writing 1 to
    // the OVERFLOW bit in GET. Unfortunately, this cannot be done in the same
    // write because it can collide with an arriving fault on the same cycle,
    // resulting in the overflow condition being instantly reasserted. However,
    // if the index is updated first and then the OVERFLOW bit is cleared, such
    // a collision will not cause a reassertion of the overflow condition.
    UVM_GPU_WRITE_ONCE(*parent_gpu->fault_buffer_info.rm_info.replayable.pFaultBufferGet, get);

    // Clearing GETPTR_CORRUPTED and OVERFLOW is not needed when GSP-RM owns
    // the HW replayable fault buffer, because UVM does not write to the actual
    // GET register; GSP-RM is responsible for clearing the bits in the real
    // GET register.
    if (!uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(parent_gpu))
        return;

    // Clear the GETPTR_CORRUPTED and OVERFLOW bits.
    get |= HWCONST(_PFB_PRI_MMU, FAULT_BUFFER_GET, GETPTR_CORRUPTED, CLEAR) |
           HWCONST(_PFB_PRI_MMU, FAULT_BUFFER_GET, OVERFLOW, CLEAR);
    UVM_GPU_WRITE_ONCE(*parent_gpu->fault_buffer_info.rm_info.replayable.pFaultBufferGet, get);
}


// TODO: Bug 1835884: [uvm] Query the maximum number of subcontexts from RM
// ... to validate the ve_id
#define MAX_SUBCONTEXTS 64
NvU8 uvm_hal_volta_fault_buffer_get_ve_id(NvU16 mmu_engine_id, uvm_mmu_engine_type_t mmu_engine_type)
{
    // Only graphics engines can generate MMU faults from different subcontexts
    if (mmu_engine_type == UVM_MMU_ENGINE_TYPE_GRAPHICS) {
        NvU16 ve_id = mmu_engine_id - NV_PFAULT_MMU_ENG_ID_GRAPHICS;
        UVM_ASSERT(ve_id < MAX_SUBCONTEXTS);

        return (NvU8)ve_id;
    }
    else {
        return 0;
    }
}

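// Map the HW ACCESS_TYPE field of a fault buffer entry to the corresponding
// UVM fault access type. Physical and virtual accesses of the same kind map
// to the same value.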
static uvm_fault_access_type_t get_fault_access_type(const NvU32 *fault_entry)
{
    NvU32 hw_access_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ACCESS_TYPE);

    switch (hw_access_type_value)
    {
        case NV_PFAULT_ACCESS_TYPE_PHYS_READ:
        case NV_PFAULT_ACCESS_TYPE_VIRT_READ:
            return UVM_FAULT_ACCESS_TYPE_READ;
        case NV_PFAULT_ACCESS_TYPE_PHYS_WRITE:
        case NV_PFAULT_ACCESS_TYPE_VIRT_WRITE:
            return UVM_FAULT_ACCESS_TYPE_WRITE;
        case NV_PFAULT_ACCESS_TYPE_PHYS_ATOMIC:
        case NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC_STRONG:
            return UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG;
        case NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC_WEAK:
            return UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK;
        case NV_PFAULT_ACCESS_TYPE_PHYS_PREFETCH:
        case NV_PFAULT_ACCESS_TYPE_VIRT_PREFETCH:
            return UVM_FAULT_ACCESS_TYPE_PREFETCH;
    }

    UVM_ASSERT_MSG(false, "Invalid fault access type value: %d\n", hw_access_type_value);

    return UVM_FAULT_ACCESS_TYPE_COUNT;
}

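// Return true if the faulting address in the entry is a virtual address,
// based on the ACCESS_TYPE field.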
static bool is_fault_address_virtual(const NvU32 *fault_entry)
{
    NvU32 hw_access_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ACCESS_TYPE);

    switch (hw_access_type_value)
    {
        case NV_PFAULT_ACCESS_TYPE_PHYS_READ:
        case NV_PFAULT_ACCESS_TYPE_PHYS_WRITE:
        case NV_PFAULT_ACCESS_TYPE_PHYS_ATOMIC:
        case NV_PFAULT_ACCESS_TYPE_PHYS_PREFETCH:
            return false;
        case NV_PFAULT_ACCESS_TYPE_VIRT_READ:
        case NV_PFAULT_ACCESS_TYPE_VIRT_WRITE:
        case NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC_STRONG:
        case NV_PFAULT_ACCESS_TYPE_VIRT_ATOMIC_WEAK:
        case NV_PFAULT_ACCESS_TYPE_VIRT_PREFETCH:
            return true;
    }

    UVM_ASSERT_MSG(false, "Invalid fault access type value: %d\n", hw_access_type_value);

    return false;
}

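// Map the HW FAULT_TYPE field of a fault buffer entry to the corresponding
// UVM fault type.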
uvm_fault_type_t uvm_hal_volta_fault_buffer_get_fault_type(const NvU32 *fault_entry)
{
    NvU32 hw_fault_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, FAULT_TYPE);

    switch (hw_fault_type_value)
    {
        case NV_PFAULT_FAULT_TYPE_PDE:
            return UVM_FAULT_TYPE_INVALID_PDE;
        case NV_PFAULT_FAULT_TYPE_PTE:
            return UVM_FAULT_TYPE_INVALID_PTE;
        case NV_PFAULT_FAULT_TYPE_RO_VIOLATION:
            return UVM_FAULT_TYPE_WRITE;
        case NV_PFAULT_FAULT_TYPE_ATOMIC_VIOLATION:
            return UVM_FAULT_TYPE_ATOMIC;
        case NV_PFAULT_FAULT_TYPE_WO_VIOLATION:
            return UVM_FAULT_TYPE_READ;

        case NV_PFAULT_FAULT_TYPE_PDE_SIZE:
            return UVM_FAULT_TYPE_PDE_SIZE;
        case NV_PFAULT_FAULT_TYPE_VA_LIMIT_VIOLATION:
            return UVM_FAULT_TYPE_VA_LIMIT_VIOLATION;
        case NV_PFAULT_FAULT_TYPE_UNBOUND_INST_BLOCK:
            return UVM_FAULT_TYPE_UNBOUND_INST_BLOCK;
        case NV_PFAULT_FAULT_TYPE_PRIV_VIOLATION:
            return UVM_FAULT_TYPE_PRIV_VIOLATION;
        case NV_PFAULT_FAULT_TYPE_PITCH_MASK_VIOLATION:
            return UVM_FAULT_TYPE_PITCH_MASK_VIOLATION;
        case NV_PFAULT_FAULT_TYPE_WORK_CREATION:
            return UVM_FAULT_TYPE_WORK_CREATION;
        case NV_PFAULT_FAULT_TYPE_UNSUPPORTED_APERTURE:
            return UVM_FAULT_TYPE_UNSUPPORTED_APERTURE;
        case NV_PFAULT_FAULT_TYPE_COMPRESSION_FAILURE:
            return UVM_FAULT_TYPE_COMPRESSION_FAILURE;
        case NV_PFAULT_FAULT_TYPE_UNSUPPORTED_KIND:
            return UVM_FAULT_TYPE_UNSUPPORTED_KIND;
        case NV_PFAULT_FAULT_TYPE_REGION_VIOLATION:
            return UVM_FAULT_TYPE_REGION_VIOLATION;
        case NV_PFAULT_FAULT_TYPE_POISONED:
            return UVM_FAULT_TYPE_POISONED;
    }

    UVM_ASSERT_MSG(false, "Invalid fault type value: %d\n", hw_fault_type_value);

    return UVM_FAULT_TYPE_COUNT;
}

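// Map the HW MMU_CLIENT_TYPE field (GPC or HUB) to the corresponding UVM
// fault client type.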
static uvm_fault_client_type_t get_fault_client_type(const NvU32 *fault_entry)
{
    NvU32 hw_client_type_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, MMU_CLIENT_TYPE);

    switch (hw_client_type_value)
    {
        case NV_PFAULT_MMU_CLIENT_TYPE_GPC:
            return UVM_FAULT_CLIENT_TYPE_GPC;
        case NV_PFAULT_MMU_CLIENT_TYPE_HUB:
            return UVM_FAULT_CLIENT_TYPE_HUB;
    }

    UVM_ASSERT_MSG(false, "Invalid mmu client type value: %d\n", hw_client_type_value);

    return UVM_FAULT_CLIENT_TYPE_COUNT;
}

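// Return the aperture (vidmem or sysmem) of the instance pointer in the fault
// buffer entry.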
static uvm_aperture_t get_fault_inst_aperture(const NvU32 *fault_entry)
{
    NvU32 hw_aperture_value = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, INST_APERTURE);

    switch (hw_aperture_value)
    {
        case NVC369_BUF_ENTRY_INST_APERTURE_VID_MEM:
            return UVM_APERTURE_VID;
        case NVC369_BUF_ENTRY_INST_APERTURE_SYS_MEM_COHERENT:
        case NVC369_BUF_ENTRY_INST_APERTURE_SYS_MEM_NONCOHERENT:
            return UVM_APERTURE_SYS;
    }

    UVM_ASSERT_MSG(false, "Invalid inst aperture value: %d\n", hw_aperture_value);

    return UVM_APERTURE_MAX;
}

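// Return a pointer to the fault buffer entry at the given index within the
// RM-provided mapping of the replayable fault buffer.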
static NvU32 *get_fault_buffer_entry(uvm_parent_gpu_t *parent_gpu, NvU32 index)
{
    fault_buffer_entry_c369_t *buffer_start;
    NvU32 *fault_entry;

    UVM_ASSERT(index < parent_gpu->fault_buffer_info.replayable.max_faults);

    buffer_start = (fault_buffer_entry_c369_t *)parent_gpu->fault_buffer_info.rm_info.replayable.bufferAddress;
    fault_entry = (NvU32 *)&buffer_start[index];

    return fault_entry;
}

// See uvm_pascal_fault_buffer.c::get_fault_buffer_entry_metadata
static UvmFaultMetadataPacket *get_fault_buffer_entry_metadata(uvm_parent_gpu_t *parent_gpu, NvU32 index)
{
    UvmFaultMetadataPacket *fault_entry_metadata;

    UVM_ASSERT(index < parent_gpu->fault_buffer_info.replayable.max_faults);
    UVM_ASSERT(!uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(parent_gpu));

    fault_entry_metadata = parent_gpu->fault_buffer_info.rm_info.replayable.bufferMetadata;
    UVM_ASSERT(fault_entry_metadata != NULL);

    return fault_entry_metadata + index;
}

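// Decode the fields shared by replayable and non-replayable fault packets:
// instance pointer and its aperture, fault address (in canonical form),
// timestamp, fault and access types, fault source (client, GPC, uTLB, MMU
// engine and ve_id), and the replayable/protected-mode flags.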
static void parse_fault_entry_common(uvm_parent_gpu_t *parent_gpu,
                                     NvU32 *fault_entry,
                                     uvm_fault_buffer_entry_t *buffer_entry)
{
    NvU64 addr_hi, addr_lo;
    NvU64 timestamp_hi, timestamp_lo;
    bool replayable_fault_enabled;

    addr_hi = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, INST_HI);
    addr_lo = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, INST_LO);
    buffer_entry->instance_ptr.address = addr_lo + (addr_hi << HWSIZE_MW(C369, BUF_ENTRY, INST_LO));
    // HW value contains the 4K page number. Shift to build the full address
    buffer_entry->instance_ptr.address <<= 12;

    buffer_entry->instance_ptr.aperture = get_fault_inst_aperture(fault_entry);

    addr_hi = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ADDR_HI);
    addr_lo = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ADDR_LO);
    // HW value contains the 4K page number. Shift to build the full address
    buffer_entry->fault_address = (addr_lo + (addr_hi << HWSIZE_MW(C369, BUF_ENTRY, ADDR_LO))) << 12;
    buffer_entry->fault_address = uvm_parent_gpu_canonical_address(parent_gpu, buffer_entry->fault_address);

    timestamp_hi = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, TIMESTAMP_HI);
    timestamp_lo = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, TIMESTAMP_LO);
    buffer_entry->timestamp = timestamp_lo + (timestamp_hi << HWSIZE_MW(C369, BUF_ENTRY, TIMESTAMP_LO));

    buffer_entry->fault_type = parent_gpu->fault_buffer_hal->get_fault_type(fault_entry);

    buffer_entry->fault_access_type = get_fault_access_type(fault_entry);

    buffer_entry->fault_source.client_type = get_fault_client_type(fault_entry);

    buffer_entry->fault_source.client_id = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, CLIENT);
    BUILD_BUG_ON(sizeof(buffer_entry->fault_source.client_id) * 8 < DRF_SIZE_MW(NVC369_BUF_ENTRY_CLIENT));

    buffer_entry->fault_source.gpc_id = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, GPC_ID);
    BUILD_BUG_ON(sizeof(buffer_entry->fault_source.gpc_id) * 8 < DRF_SIZE_MW(NVC369_BUF_ENTRY_GPC_ID));

    buffer_entry->is_replayable = (READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, REPLAYABLE_FAULT) ==
                                   NVC369_BUF_ENTRY_REPLAYABLE_FAULT_TRUE);

    // Compute global uTLB id
    if (buffer_entry->fault_source.client_type == UVM_FAULT_CLIENT_TYPE_GPC) {
        NvU16 gpc_utlb_id = parent_gpu->arch_hal->mmu_client_id_to_utlb_id(buffer_entry->fault_source.client_id);
        NvU32 utlb_id;
        UVM_ASSERT(gpc_utlb_id < parent_gpu->utlb_per_gpc_count);

        utlb_id = buffer_entry->fault_source.gpc_id * parent_gpu->utlb_per_gpc_count + gpc_utlb_id;
        UVM_ASSERT(utlb_id < parent_gpu->fault_buffer_info.replayable.utlb_count);

        buffer_entry->fault_source.utlb_id = utlb_id;
    }
    else if (buffer_entry->fault_source.client_type == UVM_FAULT_CLIENT_TYPE_HUB) {
        buffer_entry->fault_source.utlb_id = 0;
    }

    buffer_entry->fault_source.mmu_engine_id = READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, ENGINE_ID);
    BUILD_BUG_ON(sizeof(buffer_entry->fault_source.mmu_engine_id) * 8 < DRF_SIZE_MW(NVC369_BUF_ENTRY_ENGINE_ID));

    buffer_entry->fault_source.mmu_engine_type =
        parent_gpu->arch_hal->mmu_engine_id_to_type(buffer_entry->fault_source.mmu_engine_id);

    buffer_entry->fault_source.ve_id =
        parent_gpu->fault_buffer_hal->get_ve_id(buffer_entry->fault_source.mmu_engine_id,
                                                buffer_entry->fault_source.mmu_engine_type);
    BUILD_BUG_ON(1 << (sizeof(buffer_entry->fault_source.ve_id) * 8) < MAX_SUBCONTEXTS);

    buffer_entry->is_virtual = is_fault_address_virtual(fault_entry);

    buffer_entry->in_protected_mode = (READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, PROTECTED_MODE) ==
                                       NVC369_BUF_ENTRY_PROTECTED_MODE_TRUE);

    replayable_fault_enabled = (READ_HWVALUE_MW(fault_entry, C369, BUF_ENTRY, REPLAYABLE_FAULT_EN) ==
                                NVC369_BUF_ENTRY_REPLAYABLE_FAULT_EN_TRUE);
    UVM_ASSERT_MSG(replayable_fault_enabled, "Fault with REPLAYABLE_FAULT_EN bit unset\n");
}

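// Parse the replayable fault entry at the given index. When the fault buffer
// is not owned by UVM (Confidential Computing), the entry is decrypted into a
// local copy before parsing. The entry's valid bit is cleared on success.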
NV_STATUS uvm_hal_volta_fault_buffer_parse_replayable_entry(uvm_parent_gpu_t *parent_gpu,
                                                            NvU32 index,
                                                            uvm_fault_buffer_entry_t *buffer_entry)
{
    fault_buffer_entry_c369_t entry;
    NvU32 *fault_entry;

    BUILD_BUG_ON(sizeof(entry) > UVM_GPU_MMU_MAX_FAULT_PACKET_SIZE);

    // Valid bit must be set before this function is called
    UVM_ASSERT(parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, index));

    fault_entry = get_fault_buffer_entry(parent_gpu, index);

    // When Confidential Computing is enabled, faults are encrypted by RM, so
    // they need to be decrypted before they can be parsed
    if (!uvm_parent_gpu_replayable_fault_buffer_is_uvm_owned(parent_gpu)) {
        NV_STATUS status;
        UvmFaultMetadataPacket *fault_entry_metadata = get_fault_buffer_entry_metadata(parent_gpu, index);

        status = uvm_conf_computing_fault_decrypt(parent_gpu,
                                                  &entry,
                                                  fault_entry,
                                                  fault_entry_metadata->authTag,
                                                  fault_entry_metadata->valid);
        if (status != NV_OK) {
            uvm_global_set_fatal_error(status);
            return status;
        }

        fault_entry = (NvU32 *)&entry;
    }

    parse_fault_entry_common(parent_gpu, fault_entry, buffer_entry);

    UVM_ASSERT(buffer_entry->is_replayable);

    // Automatically clear valid bit for the entry in the fault buffer
    parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index);

    return NV_OK;
}

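// Parse a non-replayable fault packet. The packet is a copy provided by the
// caller, so no valid bit needs to be cleared in the RM-owned buffer.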
void uvm_hal_volta_fault_buffer_parse_non_replayable_entry(uvm_parent_gpu_t *parent_gpu,
                                                           void *fault_packet,
                                                           uvm_fault_buffer_entry_t *buffer_entry)
{
    parse_fault_entry_common(parent_gpu, fault_packet, buffer_entry);

    // No need to clear the valid bit since the fault buffer for non-replayable
    // faults is owned by RM and we are just parsing a copy of the packet
    UVM_ASSERT(!buffer_entry->is_replayable);
}