/*******************************************************************************
    Copyright (c) 2017-2024 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*******************************************************************************/

#include "nv_uvm_interface.h"
#include "uvm_common.h"
#include "uvm_api.h"
#include "uvm_gpu_non_replayable_faults.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_lock.h"
#include "uvm_tools.h"
#include "uvm_user_channel.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_kvmalloc.h"
#include "uvm_ats_faults.h"

// In the context of a CUDA application using Unified Memory, it is sometimes
// assumed that there is a single type of fault, originating from a memory
// load/store in an SM (Graphics Engine), which itself can be traced back to a
// memory access in a CUDA kernel written by a developer. In reality, faults
// can also be triggered by other parts of the GPU, i.e. by other engines, as
// the result of developer-facing APIs or of operations initiated by a
// user-mode driver. The Graphics Engine faults are called replayable faults,
// while the rest are called non-replayable. The differences between the two
// types of faults go well beyond the engine originating the fault.
//
// A non-replayable fault originates in an engine other than Graphics. UVM
// services non-replayable faults from the Copy and PBDMA (Host/ESCHED)
// Engines. Non-replayable faults originating in other engines are considered
// fatal, and do not reach the UVM driver. While UVM can distinguish between
// faults originating in the Copy Engine and faults originating in the PBDMA
// Engine, in practice they are all processed in the same way. Replayable
// fault support in Graphics was introduced in Pascal, and non-replayable
// fault support in the CE and PBDMA Engines was introduced in Volta; all
// non-replayable faults were fatal before Volta.
//
// An example of a Copy Engine non-replayable fault is a memory copy between
// two virtual addresses on a GPU, in which either the source or the
// destination pointer is not currently mapped to a physical address in the
// page tables of the GPU. An example of a PBDMA non-replayable fault is a
// semaphore acquire in which the semaphore virtual address passed as an
// argument is not currently mapped to any physical address.
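//
// As an illustration only (a hypothetical user-level sequence, not part of
// this driver, assuming the runtime implements the copy as a CE transfer
// between virtual addresses), a CUDA program could end up triggering such a
// Copy Engine fault with something like:
//
//     size_t size = 1 << 20;
//     void *src, *dst;
//     cudaStream_t stream;
//
//     cudaStreamCreate(&stream);
//     cudaMallocManaged(&src, size, cudaMemAttachGlobal);  // managed memory,
//     cudaMallocManaged(&dst, size, cudaMemAttachGlobal);  // possibly not yet
//                                                          // mapped on the GPU
//     cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream);
//
// If either virtual address is not mapped in the GPU's page tables when the
// copy executes, the Copy Engine faults and UVM services the fault as
// described in this file.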
//
// Non-replayable faults originating in the CE and PBDMA Engines result in HW
// preempting the channel associated with the fault, a mechanism called "fault
// and switch". More precisely, the switching out affects not only the channel
// that caused the fault, but all the channels in the same Time Slice Group
// (TSG). SW intervention is required so all the channels in the TSG can be
// scheduled again, but channels in other TSGs can be scheduled and resume
// their normal execution. In the case of the non-replayable faults serviced
// by UVM, the driver clears a channel's faulted bit upon successful
// servicing, but it is only when the servicing has completed for all the
// channels in the TSG that they are all allowed to be switched in.
// Non-replayable faults originating in engines other than CE and PBDMA are
// fatal because these other units lack hardware support for the "fault and
// switch" and restart mechanisms just described. On the other hand,
// replayable faults block preemption of the channel until software (UVM)
// services the fault. This is sometimes known as "fault and stall". Note that
// replayable faults prevent the execution of other channels, which are
// stalled until the fault is serviced.
//
// The "non-replayable" naming alludes to the fact that, historically, these
// faults indicated a fatal condition, so there was no recovery ("replay")
// process, and SW could not ignore or drop the fault. As discussed above,
// this is no longer the case, and while at times the hardware documentation
// uses the "fault and replay" expression for CE and PBDMA faults, we reserve
// that expression for Graphics faults and favor the term "fault and
// reschedule" instead. Replaying a fault does not necessarily imply that UVM
// has serviced it. For example, the UVM driver may choose to ignore the
// replayable faults associated with a GPU for some period of time if it
// detects that there is thrashing going on, and the GPU needs to be
// throttled. The fault entries corresponding to the ignored faults are never
// saved by UVM, but new entries (and new interrupts) will be generated by
// hardware each time after UVM issues a replay.
//
// While replayable faults are always the responsibility of UVM, the servicing
// of non-replayable faults is split between RM and UVM. In the case of
// replayable faults, UVM has sole SW ownership of the hardware buffer
// containing the faults, and it is responsible for updating the GET pointer
// to signal the hardware that a number of faults have been read. UVM also
// reads the PUT pointer value written by hardware. But in the case of
// non-replayable faults, UVM reads the fault entries out of a regular CPU
// buffer, shared with RM, called the "shadow buffer". RM is responsible for
// accessing the actual non-replayable hardware buffer, reading the PUT
// pointer, updating the GET pointer, and moving CE and PBDMA faults from the
// hardware buffer to the shadow buffer. Because the Resource Manager owns the
// HW buffer, UVM needs to call RM when servicing a non-replayable fault:
// first to figure out if there is a pending fault, and then to read entries
// from the shadow buffer.
//
// Once UVM has parsed a non-replayable fault entry corresponding to managed
// memory, and identified the VA block associated with it, the servicing logic
// for that block is identical to that of a replayable fault; see
// uvm_va_block_service_locked. Another similarity between the two types of
// faults is that they use the same entry format, uvm_fault_buffer_entry_t.

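// The entry point to this file is uvm_gpu_service_non_replayable_fault_buffer,
// at the bottom. It repeatedly asks RM for the pending fault entries
// (fetch_non_replayable_fault_buffer_entries) and services the cached entries
// one by one (service_fault). Successfully serviced faults end with the
// channel's faulted bit being cleared (clear_faulted_on_gpu), while fatal
// faults are forwarded to RM for channel teardown (schedule_kill_channel).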

// There is no error handling in this function. The caller is in charge of
// calling uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults on failure.
NV_STATUS uvm_parent_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;

    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);

    non_replayable_faults->shadow_buffer_copy = NULL;
    non_replayable_faults->fault_cache        = NULL;

    non_replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize /
                                        parent_gpu->fault_buffer_hal->entry_size(parent_gpu);

    non_replayable_faults->shadow_buffer_copy =
        uvm_kvmalloc_zero(parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize);
    if (!non_replayable_faults->shadow_buffer_copy)
        return NV_ERR_NO_MEMORY;

    non_replayable_faults->fault_cache = uvm_kvmalloc_zero(non_replayable_faults->max_faults *
                                                           sizeof(*non_replayable_faults->fault_cache));
    if (!non_replayable_faults->fault_cache)
        return NV_ERR_NO_MEMORY;

    uvm_tracker_init(&non_replayable_faults->clear_faulted_tracker);
    uvm_tracker_init(&non_replayable_faults->fault_service_tracker);

    return NV_OK;
}

void uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;

    if (non_replayable_faults->fault_cache) {
        UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->clear_faulted_tracker));
        uvm_tracker_deinit(&non_replayable_faults->clear_faulted_tracker);

        UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->fault_service_tracker));
        uvm_tracker_deinit(&non_replayable_faults->fault_service_tracker);
    }

    uvm_kvfree(non_replayable_faults->shadow_buffer_copy);
    uvm_kvfree(non_replayable_faults->fault_cache);
    non_replayable_faults->shadow_buffer_copy = NULL;
    non_replayable_faults->fault_cache        = NULL;
}

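// Check with RM whether there are non-replayable faults pending in the HW
// buffer. RM owns that buffer (see the file-level comment above), so UVM
// queries RM instead of reading the buffer pointers directly.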
bool uvm_parent_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status;
    NvBool has_pending_faults;

    UVM_ASSERT(parent_gpu->isr.non_replayable_faults.handling);

    status = nvUvmInterfaceHasPendingNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
                                                         &has_pending_faults);
    UVM_ASSERT(status == NV_OK);

    return has_pending_faults == NV_TRUE;
}

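// Ask RM to copy the pending fault entries into UVM's shadow buffer copy, and
// parse each of them into the local fault cache in uvm_fault_buffer_entry_t
// format. The number of entries fetched is returned in cached_faults.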
static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *parent_gpu, NvU32 *cached_faults)
{
    NV_STATUS status;
    NvU32 i;
    NvU32 entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
    char *current_hw_entry = (char *)non_replayable_faults->shadow_buffer_copy;
    uvm_fault_buffer_entry_t *fault_entry = non_replayable_faults->fault_cache;

    UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.non_replayable_faults.service_lock));
    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);

    status = nvUvmInterfaceGetNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
                                                  current_hw_entry,
                                                  cached_faults);

    if (status != NV_OK) {
        UVM_ERR_PRINT("nvUvmInterfaceGetNonReplayableFaults() failed: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_parent_gpu_name(parent_gpu));

        uvm_global_set_fatal_error(status);
        return status;
    }

    // Parse all faults
    for (i = 0; i < *cached_faults; ++i) {
        parent_gpu->fault_buffer_hal->parse_non_replayable_entry(parent_gpu, current_hw_entry, fault_entry);

        // The GPU aligns the fault addresses to 4k, but all of our tracking is
        // done in PAGE_SIZE chunks which might be larger.
        fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address);

        // Make sure that all fields in the entry are properly initialized
        fault_entry->va_space = NULL;
        fault_entry->is_fatal = (fault_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
        fault_entry->filtered = false;

        fault_entry->num_instances = 1;
        fault_entry->access_type_mask = uvm_fault_access_type_mask_bit(fault_entry->fault_access_type);
        INIT_LIST_HEAD(&fault_entry->merged_instances_list);
        fault_entry->non_replayable.buffer_index = i;

        if (fault_entry->is_fatal) {
            // Record the fatal fault event later as we need the va_space locked
            fault_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
        }
        else {
            fault_entry->fatal_reason = UvmEventFatalReasonInvalid;
        }

        current_hw_entry += entry_size;
        fault_entry++;
    }

    return NV_OK;
}

static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
{
    // If true, UVM uses a SW method to request RM to do the clearing on its
    // behalf.
    bool use_sw_method = false;

    // In SRIOV, the UVM (guest) driver does not have access to the privileged
    // registers used to clear the faulted bit.
    if (uvm_parent_gpu_is_virt_mode_sriov(gpu->parent))
        use_sw_method = true;

    // In Confidential Computing, access to the privileged registers is
    // blocked, in order to prevent interference between guests, or between the
    // (untrusted) host and the guests.
    if (g_uvm_global.conf_computing_enabled)
        use_sw_method = true;

    if (use_sw_method)
        UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);

    return use_sw_method;
}

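// Clear the faulted bit by pushing a clear-faulted (or SW) method on a UVM
// internal channel. The push is added to both the per-GPU and the per-channel
// clear_faulted trackers so that GPU or channel teardown can wait for it to
// complete.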
static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
                                             uvm_user_channel_t *user_channel,
                                             const uvm_fault_buffer_entry_t *fault_entry,
                                             NvU32 batch_id,
                                             uvm_tracker_t *tracker)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;

    UVM_ASSERT(!fault_entry->is_fatal);

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    tracker,
                                    &push,
                                    "Clearing set bit for address 0x%llx",
                                    fault_entry->fault_address);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Error acquiring tracker before clearing faulted: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    if (use_clear_faulted_channel_sw_method(gpu))
        gpu->parent->host_hal->clear_faulted_channel_sw_method(&push, user_channel, fault_entry);
    else
        gpu->parent->host_hal->clear_faulted_channel_method(&push, user_channel, fault_entry);

    uvm_tools_broadcast_replay(gpu, &push, batch_id, fault_entry->fault_source.client_type);

    uvm_push_end(&push);

    // Add this push to the GPU's clear_faulted_tracker so GPU removal can wait
    // on it.
    status = uvm_tracker_add_push_safe(&non_replayable_faults->clear_faulted_tracker, &push);

    // Add this push to the channel's clear_faulted_tracker so user channel
    // removal can wait on it instead of using the per-GPU tracker, which would
    // require a lock.
    if (status == NV_OK)
        status = uvm_tracker_add_push_safe(&user_channel->clear_faulted_tracker, &push);

    return status;
}

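// Clear the faulted bit with a direct register write. Since there is no push
// that orders the write against outstanding servicing work, all pending work
// in the tracker must complete first.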
static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
                                               uvm_user_channel_t *user_channel,
                                               const uvm_fault_buffer_entry_t *fault_entry,
                                               NvU32 batch_id,
                                               uvm_tracker_t *tracker)
{
    NV_STATUS status;

    UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);

    // We need to wait for all pending work before writing to the channel
    // register
    status = uvm_tracker_wait(tracker);
    if (status != NV_OK)
        return status;

    gpu->parent->host_hal->clear_faulted_channel_register(user_channel, fault_entry);

    uvm_tools_broadcast_replay_sync(gpu, batch_id, fault_entry->fault_source.client_type);

    return NV_OK;
}

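// Clear the channel's faulted bit so the channel (and eventually its TSG) can
// be rescheduled, choosing between the method-based and register-based paths
// depending on HW support and virtualization mode.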
static NV_STATUS clear_faulted_on_gpu(uvm_gpu_t *gpu,
                                      uvm_user_channel_t *user_channel,
                                      const uvm_fault_buffer_entry_t *fault_entry,
                                      NvU32 batch_id,
                                      uvm_tracker_t *tracker)
{
    if (gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu))
        return clear_faulted_method_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);

    return clear_faulted_register_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
}

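// Service a single non-replayable fault on a managed VA block. As described
// in the file-level comment, once the VA block has been identified the
// servicing logic is shared with replayable faults via
// uvm_va_block_service_locked.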
static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
                                                       uvm_va_block_t *va_block,
                                                       uvm_va_block_retry_t *va_block_retry,
                                                       uvm_fault_buffer_entry_t *fault_entry,
                                                       uvm_service_block_context_t *service_context,
                                                       const bool hmm_migratable)
{
    NV_STATUS status = NV_OK;
    uvm_page_index_t page_index;
    uvm_perf_thrashing_hint_t thrashing_hint;
    uvm_processor_id_t new_residency;
    bool read_duplicate;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    const uvm_va_policy_t *policy;

    UVM_ASSERT(!fault_entry->is_fatal);

    uvm_assert_rwsem_locked(&va_space->lock);

    UVM_ASSERT(fault_entry->va_space == va_space);
    UVM_ASSERT(fault_entry->fault_address >= va_block->start);
    UVM_ASSERT(fault_entry->fault_address <= va_block->end);

    policy = uvm_va_policy_get(va_block, fault_entry->fault_address);

    if (service_context->num_retries == 0) {
        // Notify the event to tools/performance heuristics. For now we use a
        // unique batch id per fault, since we clear the faulted channel for
        // each fault.
        uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
                                        va_block,
                                        gpu->id,
                                        policy->preferred_location,
                                        fault_entry,
                                        ++non_replayable_faults->batch_id,
                                        false);
    }

    // Check logical permissions
    status = uvm_va_block_check_logical_permissions(va_block,
                                                    service_context->block_context,
                                                    gpu->id,
                                                    uvm_va_block_cpu_page_index(va_block,
                                                                                fault_entry->fault_address),
                                                    fault_entry->fault_access_type,
                                                    uvm_range_group_address_migratable(va_space,
                                                                                       fault_entry->fault_address));
    if (status != NV_OK) {
        fault_entry->is_fatal = true;
        fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
        return NV_OK;
    }

    // TODO: Bug 1880194: Revisit thrashing detection
    thrashing_hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;

    service_context->read_duplicate_count = 0;
    service_context->thrashing_pin_count = 0;

    page_index = uvm_va_block_cpu_page_index(va_block, fault_entry->fault_address);

    // Compute new residency and update the masks
    new_residency = uvm_va_block_select_residency(va_block,
                                                  service_context->block_context,
                                                  page_index,
                                                  gpu->id,
                                                  fault_entry->access_type_mask,
                                                  policy,
                                                  &thrashing_hint,
                                                  UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
                                                  hmm_migratable,
                                                  &read_duplicate);

    // Initialize the minimum necessary state in the fault service context
    uvm_processor_mask_zero(&service_context->resident_processors);

    // Set new residency and update the masks
    uvm_processor_mask_set(&service_context->resident_processors, new_residency);

    // The masks need to be fully zeroed as the fault region may grow due to
    // prefetching
    uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
    uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);

    if (read_duplicate) {
        uvm_page_mask_zero(&service_context->read_duplicate_mask);
        uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
        service_context->read_duplicate_count = 1;
    }

    service_context->access_type[page_index] = fault_entry->fault_access_type;

    service_context->region = uvm_va_block_region_for_page(page_index);

    status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, service_context);

    ++service_context->num_retries;

    return status;
}

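// Wrapper that takes the VA block lock (and, for HMM blocks, serializes with
// ongoing migrations) around service_managed_fault_in_block_locked, retrying
// the servicing as needed via UVM_VA_BLOCK_RETRY_LOCKED, and accumulating the
// block's tracker into the non-replayable fault service tracker.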
static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
                                                uvm_va_block_t *va_block,
                                                uvm_fault_buffer_entry_t *fault_entry,
                                                const bool hmm_migratable)
{
    NV_STATUS status, tracker_status;
    uvm_va_block_retry_t va_block_retry;
    uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.non_replayable.block_service_context;

    service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
    service_context->num_retries = 0;

    if (uvm_va_block_is_hmm(va_block))
        uvm_hmm_migrate_begin_wait(va_block);

    uvm_mutex_lock(&va_block->lock);

    status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
                                       service_managed_fault_in_block_locked(gpu,
                                                                             va_block,
                                                                             &va_block_retry,
                                                                             fault_entry,
                                                                             service_context,
                                                                             hmm_migratable));

    tracker_status = uvm_tracker_add_tracker_safe(&gpu->parent->fault_buffer_info.non_replayable.fault_service_tracker,
                                                  &va_block->tracker);

    uvm_mutex_unlock(&va_block->lock);

    if (uvm_va_block_is_hmm(va_block))
        uvm_hmm_migrate_finish(va_block);

    return status == NV_OK ? tracker_status : status;
}

// See uvm_unregister_channel for comments on the channel destruction
// sequence.
static void kill_channel_delayed(void *_user_channel)
{
    uvm_user_channel_t *user_channel = (uvm_user_channel_t *)_user_channel;
    uvm_va_space_t *va_space = user_channel->kill_channel.va_space;

    uvm_va_space_down_read_rm(va_space);
    if (user_channel->gpu_va_space) {
        // RM handles the fault, which will do the correct fault reporting in
        // the kernel logs and will initiate channel teardown
        NV_STATUS status = nvUvmInterfaceReportNonReplayableFault(uvm_gpu_device_handle(user_channel->gpu),
                                                                  user_channel->kill_channel.fault_packet);
        UVM_ASSERT(status == NV_OK);
    }
    uvm_va_space_up_read_rm(va_space);

    uvm_user_channel_release(user_channel);
}

static void kill_channel_delayed_entry(void *user_channel)
{
    UVM_ENTRY_VOID(kill_channel_delayed(user_channel));
}

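// Schedule deferred work to report a fatal fault to RM, which will tear down
// the channel. The original HW fault packet is copied out of the shadow
// buffer so RM can report it, and the channel is retained until the work item
// runs.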
static void schedule_kill_channel(uvm_gpu_t *gpu,
                                  uvm_fault_buffer_entry_t *fault_entry,
                                  uvm_user_channel_t *user_channel)
{
    uvm_va_space_t *va_space = fault_entry->va_space;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    void *packet = (char *)non_replayable_faults->shadow_buffer_copy +
                   (fault_entry->non_replayable.buffer_index * gpu->parent->fault_buffer_hal->entry_size(gpu->parent));

    UVM_ASSERT(gpu);
    UVM_ASSERT(va_space);
    UVM_ASSERT(user_channel);

    if (user_channel->kill_channel.scheduled)
        return;

    user_channel->kill_channel.scheduled = true;
    user_channel->kill_channel.va_space = va_space;

    // Save the packet to be handled by RM in the channel structure
    memcpy(user_channel->kill_channel.fault_packet, packet, gpu->parent->fault_buffer_hal->entry_size(gpu->parent));

    // Retain the channel here so it is not prematurely destroyed. It will be
    // released after forwarding the fault to RM in kill_channel_delayed.
    uvm_user_channel_retain(user_channel);

    // Schedule a work item to kill the channel
    nv_kthread_q_item_init(&user_channel->kill_channel.kill_channel_q_item,
                           kill_channel_delayed_entry,
                           user_channel);

    nv_kthread_q_schedule_q_item(&gpu->parent->isr.kill_channel_q,
                                 &user_channel->kill_channel.kill_channel_q_item);
}

static void service_fault_fatal(uvm_fault_buffer_entry_t *fault_entry, NV_STATUS status)
{
    UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);

    fault_entry->is_fatal = true;
    fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
}

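// Service a fault on an address that is not backed by a managed VA block: try
// ATS servicing when a mm is available and ATS can service faults for this
// GPU VA space, and otherwise mark the fault as fatal on the entry. Faults
// caused by logical errors in the application are not propagated as servicing
// errors.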
static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
                                           struct mm_struct *mm,
                                           uvm_fault_buffer_entry_t *fault_entry,
                                           NV_STATUS lookup_status)
{
    uvm_gpu_t *gpu = gpu_va_space->gpu;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    uvm_ats_fault_invalidate_t *ats_invalidate = &non_replayable_faults->ats_invalidate;
    NV_STATUS status = lookup_status;
    NV_STATUS fatal_fault_status = NV_ERR_INVALID_ADDRESS;

    UVM_ASSERT(!fault_entry->is_fatal);

    // Avoid dropping fault events when the VA block is not found or cannot be
    // created
    uvm_perf_event_notify_gpu_fault(&fault_entry->va_space->perf_events,
                                    NULL,
                                    gpu->id,
                                    UVM_ID_INVALID,
                                    fault_entry,
                                    ++non_replayable_faults->batch_id,
                                    false);

    if (status != NV_ERR_INVALID_ADDRESS)
        return status;

    if (uvm_ats_can_service_faults(gpu_va_space, mm)) {
        struct vm_area_struct *vma;
        uvm_va_range_t *va_range_next;
        NvU64 fault_address = fault_entry->fault_address;
        uvm_fault_access_type_t fault_access_type = fault_entry->fault_access_type;
        uvm_ats_fault_context_t *ats_context = &non_replayable_faults->ats_context;

        uvm_page_mask_zero(&ats_context->read_fault_mask);
        uvm_page_mask_zero(&ats_context->write_fault_mask);

        ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;

        ats_invalidate->tlb_batch_pending = false;

        va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

        // The VA isn't managed. See if ATS knows about it.
        vma = find_vma_intersection(mm, fault_address, fault_address + 1);
        if (!vma || uvm_ats_check_in_gmmu_region(gpu_va_space->va_space, fault_address, va_range_next)) {

            // Do not return error due to logical errors in the application
            status = NV_OK;
        }
        else {
            NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
            uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
            uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;
            uvm_page_mask_t *fault_mask = (fault_access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) ?
                                                                                       &ats_context->write_fault_mask :
                                                                                       &ats_context->read_fault_mask;

            uvm_page_mask_set(fault_mask, page_index);

            status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
            if (status == NV_OK) {
                // Invalidate ATS TLB entries if needed
                if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
                    status = uvm_ats_invalidate_tlbs(gpu_va_space,
                                                     ats_invalidate,
                                                     &non_replayable_faults->fault_service_tracker);
                    fatal_fault_status = NV_OK;
                }
            }
            else {
                fatal_fault_status = status;
            }
        }
    }
    else {
        fatal_fault_status = status;

        // Do not return error due to logical errors in the application
        status = NV_OK;
    }

    if (fatal_fault_status != NV_OK)
        service_fault_fatal(fault_entry, fatal_fault_status);

    return status;
}

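// Service a single cached fault entry: look up the VA space and channel, find
// or create the managed VA block (falling back to the non-managed path), and
// on success clear the channel's faulted bit so the channel can be
// rescheduled. Fatal faults are reported to tools and cause the channel to be
// scheduled for teardown.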
static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry, const bool hmm_migratable)
{
    NV_STATUS status;
    uvm_user_channel_t *user_channel;
    uvm_va_block_t *va_block;
    uvm_va_space_t *va_space = NULL;
    struct mm_struct *mm;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    uvm_va_block_context_t *va_block_context =
        gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;

    status = uvm_parent_gpu_fault_entry_to_va_space(gpu->parent, fault_entry, &va_space);
    if (status != NV_OK) {
        // The VA space lookup will fail if we're running concurrently with
        // removal of the channel from the VA space (channel unregister, GPU VA
        // space unregister, VA space destroy, etc). The other thread will stop
        // the channel and remove the channel from the table, so the faulting
        // condition will be gone. In the case of replayable faults we need to
        // flush the buffer, but here we can just ignore the entry and proceed
        // on.
        //
        // Note that we can't have any subcontext issues here, since non-
        // replayable faults only use the address space of their channel.
        UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
        UVM_ASSERT(!va_space);
        return NV_OK;
    }

    UVM_ASSERT(va_space);

    // If an mm is registered with the VA space, we have to retain it
    // in order to lock it before locking the VA space. It is guaranteed
    // to remain valid until we release. If no mm is registered, we
    // can only service managed faults, not ATS/HMM faults.
    mm = uvm_va_space_mm_retain_lock(va_space);
    uvm_va_block_context_init(va_block_context, mm);

    uvm_va_space_down_read(va_space);

    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);

    if (!gpu_va_space) {
        // The va_space might have gone away. See the comment above.
        status = NV_OK;
        goto exit_no_channel;
    }

    fault_entry->va_space = va_space;

    user_channel = uvm_gpu_va_space_get_user_channel(gpu_va_space, fault_entry->instance_ptr);
    if (!user_channel) {
        // The channel might have gone away. See the comment above.
        status = NV_OK;
        goto exit_no_channel;
    }

    fault_entry->fault_source.channel_id = user_channel->hw_channel_id;

    if (!fault_entry->is_fatal) {
        if (mm) {
            status = uvm_va_block_find_create(fault_entry->va_space,
                                              fault_entry->fault_address,
                                              &va_block_context->hmm.vma,
                                              &va_block);
        }
        else {
            status = uvm_va_block_find_create_managed(fault_entry->va_space,
                                                      fault_entry->fault_address,
                                                      &va_block);
        }
        if (status == NV_OK)
            status = service_managed_fault_in_block(gpu_va_space->gpu, va_block, fault_entry, hmm_migratable);
        else
            status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);

        // We are done. Clear the faulted bit on the channel so it can be
        // re-scheduled.
        if (status == NV_OK && !fault_entry->is_fatal) {
            status = clear_faulted_on_gpu(gpu,
                                          user_channel,
                                          fault_entry,
                                          non_replayable_faults->batch_id,
                                          &non_replayable_faults->fault_service_tracker);
            uvm_tracker_clear(&non_replayable_faults->fault_service_tracker);
        }
    }

    if (fault_entry->is_fatal)
        uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);

    if (fault_entry->is_fatal ||
        (status != NV_OK &&
         status != NV_WARN_MORE_PROCESSING_REQUIRED &&
         status != NV_WARN_MISMATCHED_TARGET))
        schedule_kill_channel(gpu, fault_entry, user_channel);

exit_no_channel:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_release_unlock(va_space, mm);

    if (status != NV_OK &&
        status != NV_WARN_MORE_PROCESSING_REQUIRED &&
        status != NV_WARN_MISMATCHED_TARGET)
        UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n", uvm_gpu_name(gpu));

    return status;
}

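// Service a fault entry, retrying while the servicing path asks for another
// pass (NV_WARN_MORE_PROCESSING_REQUIRED). An NV_WARN_MISMATCHED_TARGET
// result causes the fault to be retried with hmm_migratable set to false.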
static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry)
{
    uvm_service_block_context_t *service_context =
        &gpu->parent->fault_buffer_info.non_replayable.block_service_context;
    NV_STATUS status;
    bool hmm_migratable = true;

    service_context->num_retries = 0;

    do {
        status = service_fault_once(gpu, fault_entry, hmm_migratable);
        if (status == NV_WARN_MISMATCHED_TARGET) {
            hmm_migratable = false;
            status = NV_WARN_MORE_PROCESSING_REQUIRED;
        }
    } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);

    return status;
}

void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
{
    NvU32 cached_faults;

    // If this handler is modified to handle fewer than all of the outstanding
    // faults, then special handling will need to be added to uvm_suspend()
    // to guarantee that fault processing has completed before control is
    // returned to the RM.
    do {
        NV_STATUS status;
        NvU32 i;

        status = fetch_non_replayable_fault_buffer_entries(gpu->parent, &cached_faults);
        if (status != NV_OK)
            return;

        // Unlike replayable faults, we do not batch up and preprocess
        // non-replayable faults, since getting multiple faults on the same
        // memory region is not very likely
        for (i = 0; i < cached_faults; ++i) {
            status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
            if (status != NV_OK)
                return;
        }
    } while (cached_faults > 0);
}