1 /*******************************************************************************
2     Copyright (c) 2017-2022 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 *******************************************************************************/
22 
23 #include "nv_uvm_interface.h"
24 #include "uvm_common.h"
25 #include "uvm_api.h"
26 #include "uvm_gpu_non_replayable_faults.h"
27 #include "uvm_gpu.h"
28 #include "uvm_hal.h"
29 #include "uvm_lock.h"
30 #include "uvm_tools.h"
31 #include "uvm_user_channel.h"
32 #include "uvm_va_space_mm.h"
33 #include "uvm_va_block.h"
34 #include "uvm_va_range.h"
35 #include "uvm_kvmalloc.h"
36 #include "uvm_ats_faults.h"
37 
// In the context of a CUDA application using Unified Memory, it is sometimes
// assumed that there is a single type of fault, originated by a memory
// load/store in an SM (Graphics Engine), which itself can be traced back to a
// memory access in a CUDA kernel written by a developer. In reality, faults can
// also be triggered by other parts of the GPU, i.e., by other engines, as the
// result of developer-facing APIs or of operations initiated by a user-mode
// driver. The Graphics Engine faults are called replayable faults, while the
// rest are called non-replayable. The differences between the two types of
// faults go well beyond the engine originating the fault.
47 //
48 // A non-replayable fault originates in an engine other than Graphics. UVM
49 // services non-replayable faults from the Copy and PBDMA (Host/ESCHED) Engines.
50 // Non-replayable faults originated in other engines are considered fatal, and
51 // do not reach the UVM driver. While UVM can distinguish between faults
52 // originated in the Copy Engine and faults originated in the PBDMA Engine, in
53 // practice they are all processed in the same way. Replayable fault support in
54 // Graphics was introduced in Pascal, and non-replayable fault support in CE and
55 // PBDMA Engines was introduced in Volta; all non-replayable faults were fatal
56 // before Volta.
57 //
// An example of a Copy Engine non-replayable fault is a memory copy between two
// virtual addresses on a GPU, in which either the source or the destination
// pointer is not currently mapped to a physical address in the page tables of
// the GPU. An example of a PBDMA non-replayable fault is a semaphore acquire in
// which the semaphore virtual address passed as an argument is not currently
// mapped to any physical address.
64 //
65 // Non-replayable faults originated in the CE and PBDMA Engines result in HW
66 // preempting the channel associated with the fault, a mechanism called "fault
67 // and switch". More precisely, the switching out affects not only the channel
68 // that caused the fault, but all the channels in the same Time Slice Group
69 // (TSG). SW intervention is required so all the channels in the TSG can be
70 // scheduled again, but channels in other TSGs can be scheduled and resume their
71 // normal execution. In the case of the non-replayable faults serviced by UVM,
72 // the driver clears a channel's faulted bit upon successful servicing, but it
73 // is only when the servicing has completed for all the channels in the TSG that
74 // they are all allowed to be switched in.  Non-replayable faults originated in
75 // engines other than CE and PBDMA are fatal because these other units lack
76 // hardware support for the "fault and switch" and restart mechanisms just
77 // described.
78 // On the other hand, replayable faults block preemption of the channel until
79 // software (UVM) services the fault. This is sometimes known as "fault and
80 // stall". Note that replayable faults prevent the execution of other channels,
81 // which are stalled until the fault is serviced.
82 //
83 // The "non-replayable" naming alludes to the fact that, historically, these
84 // faults indicated a fatal condition so there was no recovery ("replay")
85 // process, and SW could not ignore or drop the fault. As discussed before, this
// is no longer the case, and while at times the hardware documentation uses
// the "fault and replay" expression for CE and PBDMA faults, we reserve that
// expression for Graphics faults and favor the term "fault and switch"
// instead. Replaying a fault does not necessarily imply that UVM has serviced
90 // it. For example, the UVM driver may choose to ignore the replayable faults
91 // associated with a GPU for some period of time if it detects that there is
// thrashing going on and the GPU needs to be throttled. The fault entries
// corresponding to the ignored faults are never saved by UVM, but new entries
// (and new interrupts) will be generated by hardware each time UVM issues a
// replay.
96 //
97 // While replayable faults are always the responsibility of UVM, the servicing
98 // of non-replayable faults is split between RM and UVM. In the case of
99 // replayable faults, UVM has sole SW ownership of the hardware buffer
100 // containing the faults, and it is responsible for updating the GET pointer to
101 // signal the hardware that a number of faults have been read. UVM also reads
102 // the PUT pointer value written by hardware. But in the case of non-replayable
103 // faults, UVM reads the fault entries out of a regular CPU buffer, shared with
// RM, called the "shadow buffer". RM is responsible for accessing the actual
105 // non-replayable hardware buffer, reading the PUT pointer, updating the GET
106 // pointer, and moving CE and PBDMA faults from the hardware buffer to the
107 // shadow buffer. Because the Resource Manager owns the HW buffer, UVM needs to
108 // call RM when servicing a non-replayable fault, first to figure out if there
109 // is a pending fault, and then to read entries from the shadow buffer.
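// In this file, that interaction reduces to two calls into the UVM-RM
// interface: nvUvmInterfaceHasPendingNonReplayableFaults(), to query whether
// any faults are pending, and nvUvmInterfaceGetNonReplayableFaults(), to read
// the pending entries so they can be parsed into uvm_fault_buffer_entry_t
// format.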
110 //
111 // Once UVM has parsed a non-replayable fault entry corresponding to managed
112 // memory, and identified the VA block associated with it, the servicing logic
// for that block is identical to that of a replayable fault; see
114 // uvm_va_block_service_locked. Another similarity between the two types of
115 // faults is that they use the same entry format, uvm_fault_buffer_entry_t.
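//
// At a high level, the servicing flow implemented in this file is as follows:
// uvm_gpu_service_non_replayable_fault_buffer() repeatedly fetches entries
// from the shadow buffer (fetch_non_replayable_fault_buffer_entries) and
// services them one at a time (service_fault). Faults on managed memory are
// serviced through their VA block (service_managed_fault_in_block); other
// faults are handed to ATS or marked fatal (service_non_managed_fault).
// Successful servicing ends by clearing the channel's faulted bit
// (clear_faulted_on_gpu); fatal faults and servicing errors result in channel
// teardown being scheduled (schedule_kill_channel).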
116 
117 
118 // There is no error handling in this function. The caller is in charge of
119 // calling uvm_gpu_fault_buffer_deinit_non_replayable_faults on failure.
120 NV_STATUS uvm_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
121 {
122     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
123 
124     UVM_ASSERT(parent_gpu->non_replayable_faults_supported);
125 
126     non_replayable_faults->shadow_buffer_copy = NULL;
127     non_replayable_faults->fault_cache        = NULL;
128 
129     non_replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize /
130                                         parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
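    // For example (illustrative numbers only; the real values come from RM and
    // the fault buffer HAL): with a hypothetical 8KB buffer and 32-byte
    // entries, max_faults would be 8192 / 32 = 256.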
131 
132     non_replayable_faults->shadow_buffer_copy =
133         uvm_kvmalloc_zero(parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize);
134     if (!non_replayable_faults->shadow_buffer_copy)
135         return NV_ERR_NO_MEMORY;
136 
137     non_replayable_faults->fault_cache = uvm_kvmalloc_zero(non_replayable_faults->max_faults *
138                                                            sizeof(*non_replayable_faults->fault_cache));
139     if (!non_replayable_faults->fault_cache)
140         return NV_ERR_NO_MEMORY;
141 
142     uvm_tracker_init(&non_replayable_faults->clear_faulted_tracker);
143     uvm_tracker_init(&non_replayable_faults->fault_service_tracker);
144 
145     return NV_OK;
146 }
147 
148 void uvm_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
149 {
150     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
151 
152     if (non_replayable_faults->fault_cache) {
153         UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->clear_faulted_tracker));
154         uvm_tracker_deinit(&non_replayable_faults->clear_faulted_tracker);
155 
156         UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->fault_service_tracker));
157         uvm_tracker_deinit(&non_replayable_faults->fault_service_tracker);
158     }
159 
160     uvm_kvfree(non_replayable_faults->shadow_buffer_copy);
161     uvm_kvfree(non_replayable_faults->fault_cache);
162     non_replayable_faults->shadow_buffer_copy = NULL;
163     non_replayable_faults->fault_cache        = NULL;
164 }
165 
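// Ask RM whether there are any non-replayable faults pending for this GPU.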
166 bool uvm_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
167 {
168     NV_STATUS status;
169     NvBool has_pending_faults;
170 
171     UVM_ASSERT(parent_gpu->isr.non_replayable_faults.handling);
172 
173     status = nvUvmInterfaceHasPendingNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
174                                                          &has_pending_faults);
175     UVM_ASSERT(status == NV_OK);
176 
177     return has_pending_faults == NV_TRUE;
178 }
179 
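// Call RM to copy any pending fault packets into shadow_buffer_copy, parse
// them into uvm_fault_buffer_entry_t format in the fault cache, and return the
// number of faults fetched.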
180 static NvU32 fetch_non_replayable_fault_buffer_entries(uvm_gpu_t *gpu)
181 {
182     NV_STATUS status;
183     NvU32 i = 0;
184     NvU32 cached_faults = 0;
185     uvm_fault_buffer_entry_t *fault_cache;
186     NvU32 entry_size = gpu->parent->fault_buffer_hal->entry_size(gpu->parent);
187     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
188     char *current_hw_entry = (char *)non_replayable_faults->shadow_buffer_copy;
189 
190     fault_cache = non_replayable_faults->fault_cache;
191 
192     UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.non_replayable_faults.service_lock));
193     UVM_ASSERT(gpu->parent->non_replayable_faults_supported);
194 
195     status = nvUvmInterfaceGetNonReplayableFaults(&gpu->parent->fault_buffer_info.rm_info,
196                                                   non_replayable_faults->shadow_buffer_copy,
197                                                   &cached_faults);
198     UVM_ASSERT(status == NV_OK);
199 
200     // Parse all faults
201     for (i = 0; i < cached_faults; ++i) {
        uvm_fault_buffer_entry_t *fault_entry = &fault_cache[i];
203 
204         gpu->parent->fault_buffer_hal->parse_non_replayable_entry(gpu->parent, current_hw_entry, fault_entry);
205 
        // The GPU aligns the fault addresses to 4K, but all of our tracking is
        // done in PAGE_SIZE chunks, which may be larger.
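        // For example, on a kernel built with a 64K PAGE_SIZE, a 4K-aligned
        // fault address of 0x19000 would be aligned down to 0x10000.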
208         fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address);
209 
210         // Make sure that all fields in the entry are properly initialized
211         fault_entry->va_space = NULL;
212         fault_entry->is_fatal = (fault_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
213         fault_entry->filtered = false;
214 
215         fault_entry->num_instances = 1;
216         fault_entry->access_type_mask = uvm_fault_access_type_mask_bit(fault_entry->fault_access_type);
217         INIT_LIST_HEAD(&fault_entry->merged_instances_list);
218         fault_entry->non_replayable.buffer_index = i;
219 
220         if (fault_entry->is_fatal) {
221             // Record the fatal fault event later as we need the va_space locked
222             fault_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
223         }
224         else {
225             fault_entry->fatal_reason = UvmEventFatalReasonInvalid;
226         }
227 
228         current_hw_entry += entry_size;
229     }
230 
231     return cached_faults;
232 }
233 
234 // In SRIOV, the UVM (guest) driver does not have access to the privileged
235 // registers used to clear the faulted bit. Instead, UVM requests host RM to do
236 // the clearing on its behalf, using a SW method.
237 static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
238 {
239     if (uvm_gpu_is_virt_mode_sriov(gpu)) {
240         UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
241         return true;
242     }
243 
244     return false;
245 }
246 
247 static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
248                                              uvm_user_channel_t *user_channel,
249                                              const uvm_fault_buffer_entry_t *fault_entry,
250                                              NvU32 batch_id,
251                                              uvm_tracker_t *tracker)
252 {
253     NV_STATUS status;
254     uvm_push_t push;
255     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
256 
257     UVM_ASSERT(!fault_entry->is_fatal);
258 
    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    tracker,
                                    &push,
                                    "Clearing faulted bit for address 0x%llx",
                                    fault_entry->fault_address);
265     if (status != NV_OK) {
266         UVM_ERR_PRINT("Error acquiring tracker before clearing faulted: %s, GPU %s\n",
267                       nvstatusToString(status),
268                       uvm_gpu_name(gpu));
269         return status;
270     }
271 
272     if (use_clear_faulted_channel_sw_method(gpu))
273         gpu->parent->host_hal->clear_faulted_channel_sw_method(&push, user_channel, fault_entry);
274     else
275         gpu->parent->host_hal->clear_faulted_channel_method(&push, user_channel, fault_entry);
276 
277     uvm_tools_broadcast_replay(gpu, &push, batch_id, fault_entry->fault_source.client_type);
278 
279     uvm_push_end(&push);
280 
281     // Add this push to the GPU's clear_faulted_tracker so GPU removal can wait
282     // on it.
283     status = uvm_tracker_add_push_safe(&non_replayable_faults->clear_faulted_tracker, &push);
284 
285     // Add this push to the channel's clear_faulted_tracker so user channel
286     // removal can wait on it instead of using the per-GPU tracker, which would
287     // require a lock.
288     if (status == NV_OK)
289         status = uvm_tracker_add_push_safe(&user_channel->clear_faulted_tracker, &push);
290 
291     return status;
292 }
293 
294 static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
295                                                uvm_user_channel_t *user_channel,
296                                                const uvm_fault_buffer_entry_t *fault_entry,
297                                                NvU32 batch_id,
298                                                uvm_tracker_t *tracker)
299 {
300     NV_STATUS status;
301 
302     UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);
303 
304     // We need to wait for all pending work before writing to the channel
305     // register
306     status = uvm_tracker_wait(tracker);
307     if (status != NV_OK)
308         return status;
309 
310     gpu->parent->host_hal->clear_faulted_channel_register(user_channel, fault_entry);
311 
312     uvm_tools_broadcast_replay_sync(gpu, batch_id, fault_entry->fault_source.client_type);
313 
314     return NV_OK;
315 }
316 
317 static NV_STATUS clear_faulted_on_gpu(uvm_gpu_t *gpu,
318                                       uvm_user_channel_t *user_channel,
319                                       const uvm_fault_buffer_entry_t *fault_entry,
320                                       NvU32 batch_id,
321                                       uvm_tracker_t *tracker)
322 {
323     if (gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu))
324         return clear_faulted_method_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
325 
326     return clear_faulted_register_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
327 }
328 
329 static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
330                                                        uvm_va_block_t *va_block,
331                                                        uvm_va_block_retry_t *va_block_retry,
332                                                        uvm_fault_buffer_entry_t *fault_entry,
333                                                        uvm_service_block_context_t *service_context)
334 {
335     NV_STATUS status = NV_OK;
336     uvm_page_index_t page_index;
337     uvm_perf_thrashing_hint_t thrashing_hint;
338     uvm_processor_id_t new_residency;
339     bool read_duplicate;
340     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
341     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
342 
343     UVM_ASSERT(!fault_entry->is_fatal);
344 
345     uvm_assert_rwsem_locked(&va_space->lock);
346 
347     UVM_ASSERT(fault_entry->va_space == va_space);
348     UVM_ASSERT(fault_entry->fault_address >= va_block->start);
349     UVM_ASSERT(fault_entry->fault_address <= va_block->end);
350 
351     service_context->block_context.policy = uvm_va_policy_get(va_block, fault_entry->fault_address);
352 
353     if (service_context->num_retries == 0) {
        // Notify the event to tools/performance heuristics. For now, we use a
        // unique batch id per fault, since we clear the faulted channel for
        // each fault.
357         uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
358                                         va_block,
359                                         gpu->id,
360                                         service_context->block_context.policy->preferred_location,
361                                         fault_entry,
362                                         ++non_replayable_faults->batch_id,
363                                         false);
364     }
365 
366     // Check logical permissions
367     status = uvm_va_block_check_logical_permissions(va_block,
368                                                     &service_context->block_context,
369                                                     gpu->id,
370                                                     uvm_va_block_cpu_page_index(va_block,
371                                                                                 fault_entry->fault_address),
372                                                     fault_entry->fault_access_type,
373                                                     uvm_range_group_address_migratable(va_space,
374                                                                                        fault_entry->fault_address));
375     if (status != NV_OK) {
376         fault_entry->is_fatal = true;
377         fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
378         return NV_OK;
379     }
380 
381     // TODO: Bug 1880194: Revisit thrashing detection
382     thrashing_hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
383 
384     service_context->read_duplicate_count = 0;
385     service_context->thrashing_pin_count = 0;
386 
387     page_index = uvm_va_block_cpu_page_index(va_block, fault_entry->fault_address);
388 
389     // Compute new residency and update the masks
390     new_residency = uvm_va_block_select_residency(va_block,
391                                                   &service_context->block_context,
392                                                   page_index,
393                                                   gpu->id,
394                                                   fault_entry->access_type_mask,
395                                                   service_context->block_context.policy,
396                                                   &thrashing_hint,
397                                                   UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
398                                                   &read_duplicate);
399 
400     // Initialize the minimum necessary state in the fault service context
401     uvm_processor_mask_zero(&service_context->resident_processors);
402 
403     // Set new residency and update the masks
404     uvm_processor_mask_set(&service_context->resident_processors, new_residency);
405 
    // The masks need to be fully zeroed as the fault region may grow due to
    // prefetching
407     uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
408     uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
409 
410     if (read_duplicate) {
411         uvm_page_mask_zero(&service_context->read_duplicate_mask);
412         uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
413         service_context->read_duplicate_count = 1;
414     }
415 
416     service_context->access_type[page_index] = fault_entry->fault_access_type;
417 
418     service_context->region = uvm_va_block_region_for_page(page_index);
419 
420     status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, service_context);
421 
422     ++service_context->num_retries;
423 
424     return status;
425 }
426 
427 static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
428                                                 uvm_va_block_t *va_block,
429                                                 uvm_fault_buffer_entry_t *fault_entry)
430 {
431     NV_STATUS status, tracker_status;
432     uvm_va_block_retry_t va_block_retry;
    uvm_service_block_context_t *service_context =
        &gpu->parent->fault_buffer_info.non_replayable.block_service_context;
434 
435     service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
436     service_context->num_retries = 0;
437 
438     if (uvm_va_block_is_hmm(va_block)) {
439         uvm_hmm_service_context_init(service_context);
440         uvm_hmm_migrate_begin_wait(va_block);
441     }
442 
443     uvm_mutex_lock(&va_block->lock);
444 
445     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
446                                        service_managed_fault_in_block_locked(gpu,
447                                                                              va_block,
448                                                                              &va_block_retry,
449                                                                              fault_entry,
450                                                                              service_context));
451 
452     tracker_status = uvm_tracker_add_tracker_safe(&gpu->parent->fault_buffer_info.non_replayable.fault_service_tracker,
453                                                   &va_block->tracker);
454 
455     uvm_mutex_unlock(&va_block->lock);
456 
457     if (uvm_va_block_is_hmm(va_block))
458         uvm_hmm_migrate_finish(va_block);
459 
    return status == NV_OK ? tracker_status : status;
461 }
462 
// See uvm_unregister_channel for comments on the channel destruction
464 // sequence.
465 static void kill_channel_delayed(void *_user_channel)
466 {
467     uvm_user_channel_t *user_channel = (uvm_user_channel_t *)_user_channel;
468     uvm_va_space_t *va_space = user_channel->kill_channel.va_space;
469 
470     uvm_va_space_down_read_rm(va_space);
471     if (user_channel->gpu_va_space) {
        // Let RM handle the fault: it will do the correct fault reporting in
        // the kernel logs and will initiate channel teardown.
474         NV_STATUS status = nvUvmInterfaceReportNonReplayableFault(uvm_gpu_device_handle(user_channel->gpu),
475                                                                   user_channel->kill_channel.fault_packet);
476         UVM_ASSERT(status == NV_OK);
477     }
478     uvm_va_space_up_read_rm(va_space);
479 
480     uvm_user_channel_release(user_channel);
481 }
482 
483 static void kill_channel_delayed_entry(void *user_channel)
484 {
485     UVM_ENTRY_VOID(kill_channel_delayed(user_channel));
486 }
487 
488 static void schedule_kill_channel(uvm_gpu_t *gpu,
489                                   uvm_fault_buffer_entry_t *fault_entry,
490                                   uvm_user_channel_t *user_channel)
491 {
492     uvm_va_space_t *va_space = fault_entry->va_space;
493     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
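    // Locate the original packet for this fault in the shadow buffer copy;
    // buffer_index was recorded when the entry was parsed in
    // fetch_non_replayable_fault_buffer_entries().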
494     void *packet = (char *)non_replayable_faults->shadow_buffer_copy +
495                    (fault_entry->non_replayable.buffer_index * gpu->parent->fault_buffer_hal->entry_size(gpu->parent));
496 
497     UVM_ASSERT(gpu);
498     UVM_ASSERT(va_space);
499     UVM_ASSERT(user_channel);
500 
501     if (user_channel->kill_channel.scheduled)
502         return;
503 
504     user_channel->kill_channel.scheduled = true;
505     user_channel->kill_channel.va_space = va_space;
506 
507     // Save the packet to be handled by RM in the channel structure
508     memcpy(user_channel->kill_channel.fault_packet, packet, gpu->parent->fault_buffer_hal->entry_size(gpu->parent));
509 
510     // Retain the channel here so it is not prematurely destroyed. It will be
511     // released after forwarding the fault to RM in kill_channel_delayed.
512     uvm_user_channel_retain(user_channel);
513 
514     // Schedule a work item to kill the channel
515     nv_kthread_q_item_init(&user_channel->kill_channel.kill_channel_q_item,
516                            kill_channel_delayed_entry,
517                            user_channel);
518 
519     nv_kthread_q_schedule_q_item(&gpu->parent->isr.kill_channel_q,
520                                  &user_channel->kill_channel.kill_channel_q_item);
521 }
522 
523 static void service_fault_fatal(uvm_fault_buffer_entry_t *fault_entry, NV_STATUS status)
524 {
525     UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);
526 
527     fault_entry->is_fatal = true;
528     fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
529 }
530 
531 static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
532                                            struct mm_struct *mm,
533                                            uvm_fault_buffer_entry_t *fault_entry,
534                                            NV_STATUS lookup_status)
535 {
536     uvm_gpu_t *gpu = gpu_va_space->gpu;
537     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
538     uvm_ats_fault_invalidate_t *ats_invalidate = &non_replayable_faults->ats_invalidate;
539     NV_STATUS status = lookup_status;
540     NV_STATUS fatal_fault_status = NV_ERR_INVALID_ADDRESS;
541 
542     UVM_ASSERT(!fault_entry->is_fatal);
543 
    // Avoid dropping fault events when the VA block is not found or cannot be
    // created
545     uvm_perf_event_notify_gpu_fault(&fault_entry->va_space->perf_events,
546                                     NULL,
547                                     gpu->id,
548                                     UVM_ID_INVALID,
549                                     fault_entry,
550                                     ++non_replayable_faults->batch_id,
551                                     false);
552 
553     if (status != NV_ERR_INVALID_ADDRESS)
554         return status;
555 
556     if (uvm_ats_can_service_faults(gpu_va_space, mm)) {
557         struct vm_area_struct *vma;
558         uvm_va_range_t *va_range_next;
559         NvU64 fault_address = fault_entry->fault_address;
560         uvm_fault_access_type_t fault_access_type = fault_entry->fault_access_type;
561         uvm_ats_fault_context_t *ats_context = &non_replayable_faults->ats_context;
562 
563         uvm_page_mask_zero(&ats_context->read_fault_mask);
564         uvm_page_mask_zero(&ats_context->write_fault_mask);
565 
566         ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;
567 
568         ats_invalidate->write_faults_in_batch = false;
569 
570         va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);
571 
572         // The VA isn't managed. See if ATS knows about it.
573         vma = find_vma_intersection(mm, fault_address, fault_address + 1);
574         if (!vma || uvm_ats_check_in_gmmu_region(gpu_va_space->va_space, fault_address, va_range_next)) {
575 
            // Do not return an error due to logical errors in the application
577             status = NV_OK;
578         }
579         else {
580             NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
581             uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
582             uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;
583             uvm_page_mask_t *fault_mask = (fault_access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) ?
584                                                                                        &ats_context->write_fault_mask :
585                                                                                        &ats_context->read_fault_mask;
586 
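            // For example, assuming the usual 2MB UVM_VA_BLOCK_SIZE and a 4K
            // PAGE_SIZE, a fault at base + 0x6000 yields page_index 6 in the
            // fault mask.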
587             uvm_page_mask_set(fault_mask, page_index);
588 
589             status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
590             if (status == NV_OK) {
591                 // Invalidate ATS TLB entries if needed
592                 if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
593                     status = uvm_ats_invalidate_tlbs(gpu_va_space,
594                                                      ats_invalidate,
595                                                      &non_replayable_faults->fault_service_tracker);
596                     fatal_fault_status = NV_OK;
597                 }
598             }
599             else {
600                 fatal_fault_status = status;
601             }
602         }
603     }
604     else {
605         fatal_fault_status = status;
606 
        // Do not return an error due to logical errors in the application
608         status = NV_OK;
609     }
610 
611     if (fatal_fault_status != NV_OK)
612         service_fault_fatal(fault_entry, fatal_fault_status);
613 
614     return status;
615 }
616 
617 static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry)
618 {
619     NV_STATUS status;
620     uvm_user_channel_t *user_channel;
621     uvm_va_block_t *va_block;
622     uvm_va_space_t *va_space = NULL;
623     struct mm_struct *mm;
624     uvm_gpu_va_space_t *gpu_va_space;
625     uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
626     uvm_va_block_context_t *va_block_context =
627         &gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
628 
629     status = uvm_gpu_fault_entry_to_va_space(gpu, fault_entry, &va_space);
630     if (status != NV_OK) {
631         // The VA space lookup will fail if we're running concurrently with
632         // removal of the channel from the VA space (channel unregister, GPU VA
633         // space unregister, VA space destroy, etc). The other thread will stop
634         // the channel and remove the channel from the table, so the faulting
635         // condition will be gone. In the case of replayable faults we need to
636         // flush the buffer, but here we can just ignore the entry and proceed
637         // on.
638         //
639         // Note that we can't have any subcontext issues here, since non-
640         // replayable faults only use the address space of their channel.
641         UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
642         UVM_ASSERT(!va_space);
643         return NV_OK;
644     }
645 
646     UVM_ASSERT(va_space);
647 
    // If an mm is registered with the VA space, we have to retain it in order
    // to lock it before locking the VA space. It is guaranteed to remain valid
    // until we release it. If no mm is registered, we can only service managed
    // faults, not ATS/HMM faults.
652     mm = uvm_va_space_mm_retain_lock(va_space);
653     va_block_context->mm = mm;
654 
655     uvm_va_space_down_read(va_space);
656 
657     gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
658 
659     if (!gpu_va_space) {
        // The GPU VA space might have gone away. See the comment above.
661         status = NV_OK;
662         goto exit_no_channel;
663     }
664 
665     fault_entry->va_space = va_space;
666 
667     user_channel = uvm_gpu_va_space_get_user_channel(gpu_va_space, fault_entry->instance_ptr);
668     if (!user_channel) {
669         // The channel might have gone away. See the comment above.
670         status = NV_OK;
671         goto exit_no_channel;
672     }
673 
674     fault_entry->fault_source.channel_id = user_channel->hw_channel_id;
675 
676     if (!fault_entry->is_fatal) {
677         status = uvm_va_block_find_create(fault_entry->va_space,
678                                           fault_entry->fault_address,
679                                           va_block_context,
680                                           &va_block);
681         if (status == NV_OK)
682             status = service_managed_fault_in_block(gpu_va_space->gpu, va_block, fault_entry);
683         else
684             status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);
685 
        // We are done servicing the fault. Clear the faulted bit on the
        // channel so it can be scheduled again.
688         if (status == NV_OK && !fault_entry->is_fatal) {
689             status = clear_faulted_on_gpu(gpu,
690                                           user_channel,
691                                           fault_entry,
692                                           non_replayable_faults->batch_id,
693                                           &non_replayable_faults->fault_service_tracker);
694             uvm_tracker_clear(&non_replayable_faults->fault_service_tracker);
695         }
696     }
697 
    if (fault_entry->is_fatal)
        uvm_tools_record_gpu_fatal_fault(gpu->parent->id,
                                         fault_entry->va_space,
                                         fault_entry,
                                         fault_entry->fatal_reason);
700 
701     if (status != NV_OK || fault_entry->is_fatal)
702         schedule_kill_channel(gpu, fault_entry, user_channel);
703 
704 exit_no_channel:
705     uvm_va_space_up_read(va_space);
706     uvm_va_space_mm_release_unlock(va_space, mm);
707 
708     return status;
709 }
710 
711 void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
712 {
713     NV_STATUS status = NV_OK;
714     NvU32 cached_faults;
715 
716     // If this handler is modified to handle fewer than all of the outstanding
717     // faults, then special handling will need to be added to uvm_suspend()
718     // to guarantee that fault processing has completed before control is
719     // returned to the RM.
720     while ((cached_faults = fetch_non_replayable_fault_buffer_entries(gpu)) > 0) {
721         NvU32 i;
722 
        // Unlike replayable faults, we do not batch up and preprocess
        // non-replayable faults, since getting multiple faults on the same
        // memory region is not very likely.
726         //
727         // TODO: Bug 2103669: [UVM/ATS] Optimize ATS fault servicing
728         for (i = 0; i < cached_faults; ++i) {
729             status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
730             if (status != NV_OK)
731                 break;
732         }
733     }
734 
735     if (status != NV_OK)
736         UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
737 }
738