/*******************************************************************************
    Copyright (c) 2017-2024 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*******************************************************************************/

#include "nv_uvm_interface.h"
#include "uvm_common.h"
#include "uvm_api.h"
#include "uvm_gpu_non_replayable_faults.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_lock.h"
#include "uvm_tools.h"
#include "uvm_user_channel.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_kvmalloc.h"
#include "uvm_ats_faults.h"

// In the context of a CUDA application using Unified Memory, it is sometimes
// assumed that there is a single type of fault, originated by a memory
// load/store in an SM (Graphics Engine), which itself can be traced back to a
// memory access in a CUDA kernel written by a developer. In reality, faults
// can also be triggered by other parts of the GPU, i.e. by other engines, as
// the result of developer-facing APIs, or operations initiated by a user-mode
// driver. The Graphics Engine faults are called replayable faults, while the
// rest are called non-replayable. The differences between the two types of
// faults go well beyond the engine originating the fault.
//
// A non-replayable fault originates in an engine other than Graphics. UVM
// services non-replayable faults from the Copy and PBDMA (Host/ESCHED) Engines.
// Non-replayable faults originated in other engines are considered fatal, and
// do not reach the UVM driver. While UVM can distinguish between faults
// originated in the Copy Engine and faults originated in the PBDMA Engine, in
// practice they are all processed in the same way. Replayable fault support in
// Graphics was introduced in Pascal, and non-replayable fault support in CE and
// PBDMA Engines was introduced in Volta; all non-replayable faults were fatal
// before Volta.
//
// An example of a Copy Engine non-replayable fault is a memory copy between two
// virtual addresses on a GPU, in which either the source or destination
// pointers are not currently mapped to a physical address in the page tables of
// the GPU. An example of a PBDMA non-replayable fault is a semaphore acquire in
// which the semaphore virtual address passed as argument is currently not
// mapped to any physical address.
//
// Non-replayable faults originated in the CE and PBDMA Engines result in HW
// preempting the channel associated with the fault, a mechanism called "fault
// and switch". More precisely, the switching out affects not only the channel
// that caused the fault, but all the channels in the same Time Slice Group
// (TSG). SW intervention is required so all the channels in the TSG can be
// scheduled again, but channels in other TSGs can be scheduled and resume their
// normal execution. In the case of the non-replayable faults serviced by UVM,
// the driver clears a channel's faulted bit upon successful servicing, but it
// is only when the servicing has completed for all the channels in the TSG that
// they are all allowed to be switched in. Non-replayable faults originated in
// engines other than CE and PBDMA are fatal because these other units lack
// hardware support for the "fault and switch" and restart mechanisms just
// described.
// On the other hand, replayable faults block preemption of the channel until
// software (UVM) services the fault. This is sometimes known as "fault and
// stall". Note that replayable faults prevent the execution of other channels,
// which are stalled until the fault is serviced.
//
// The "non-replayable" naming alludes to the fact that, historically, these
// faults indicated a fatal condition so there was no recovery ("replay")
// process, and SW could not ignore or drop the fault. As discussed before, this
// is no longer the case and while at times the hardware documentation uses the
// "fault and replay" expression for CE and PBDMA faults, we reserve that
// expression for Graphics faults and favor the term "fault and reschedule"
// instead. Replaying a fault does not necessarily imply that UVM has serviced
// it. For example, the UVM driver may choose to ignore the replayable faults
// associated with a GPU for some period of time if it detects that there is
// thrashing going on, and the GPU needs to be throttled. The fault entries
// corresponding to the ignored faults are never saved by UVM, but new entries
// (and new interrupts) will be generated by hardware each time after UVM issues
// a replay.
//
// While replayable faults are always the responsibility of UVM, the servicing
// of non-replayable faults is split between RM and UVM. In the case of
// replayable faults, UVM has sole SW ownership of the hardware buffer
// containing the faults, and it is responsible for updating the GET pointer to
// signal the hardware that a number of faults have been read. UVM also reads
// the PUT pointer value written by hardware. But in the case of non-replayable
// faults, UVM reads the fault entries out of a regular CPU buffer, shared with
// RM, called "shadow buffer". RM is responsible for accessing the actual
// non-replayable hardware buffer, reading the PUT pointer, updating the GET
// pointer, and moving CE and PBDMA faults from the hardware buffer to the
// shadow buffer. Because the Resource Manager owns the HW buffer, UVM needs to
// call RM when servicing a non-replayable fault, first to figure out if there
// is a pending fault, and then to read entries from the shadow buffer.
//
// Once UVM has parsed a non-replayable fault entry corresponding to managed
// memory, and identified the VA block associated with it, the servicing logic
// for that block is identical to that of a replayable fault, see
// uvm_va_block_service_locked. Another similarity between the two types of
// faults is that they use the same entry format, uvm_fault_buffer_entry_t.
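//
// The overall servicing flow implemented in this file is:
//
//   uvm_gpu_service_non_replayable_fault_buffer()
//     fetch_non_replayable_fault_buffer_entries()   // copy and parse entries
//                                                   // from the RM shadow buffer
//     for each cached fault:
//       service_fault()
//         service_fault_once()
//           service_managed_fault_in_block()        // managed (UVM) memory
//           or service_non_managed_fault()          // ATS or fatal faults
//           clear_faulted_on_gpu()                  // let HW reschedule the
//                                                   // faulting channel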


// There is no error handling in this function. The caller is in charge of
// calling uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults on failure.
NV_STATUS uvm_parent_gpu_fault_buffer_init_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;

    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);

    non_replayable_faults->shadow_buffer_copy = NULL;
    non_replayable_faults->fault_cache = NULL;

    non_replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize /
                                        parent_gpu->fault_buffer_hal->entry_size(parent_gpu);

    non_replayable_faults->shadow_buffer_copy =
        uvm_kvmalloc_zero(parent_gpu->fault_buffer_info.rm_info.nonReplayable.bufferSize);
    if (!non_replayable_faults->shadow_buffer_copy)
        return NV_ERR_NO_MEMORY;

    non_replayable_faults->fault_cache = uvm_kvmalloc_zero(non_replayable_faults->max_faults *
                                                           sizeof(*non_replayable_faults->fault_cache));
    if (!non_replayable_faults->fault_cache)
        return NV_ERR_NO_MEMORY;

    uvm_tracker_init(&non_replayable_faults->clear_faulted_tracker);
    uvm_tracker_init(&non_replayable_faults->fault_service_tracker);

    return NV_OK;
}

void uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(uvm_parent_gpu_t *parent_gpu)
{
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;

    if (non_replayable_faults->fault_cache) {
        UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->clear_faulted_tracker));
        uvm_tracker_deinit(&non_replayable_faults->clear_faulted_tracker);

        UVM_ASSERT(uvm_tracker_is_empty(&non_replayable_faults->fault_service_tracker));
        uvm_tracker_deinit(&non_replayable_faults->fault_service_tracker);
    }

    uvm_kvfree(non_replayable_faults->shadow_buffer_copy);
    uvm_kvfree(non_replayable_faults->fault_cache);
    non_replayable_faults->shadow_buffer_copy = NULL;
    non_replayable_faults->fault_cache = NULL;
}

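// Ask RM whether there are non-replayable faults pending in the HW buffer. UVM
// cannot read the HW buffer directly, since RM owns it (see the block comment
// at the top of this file).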
bool uvm_parent_gpu_non_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
{
    NV_STATUS status;
    NvBool has_pending_faults;

    UVM_ASSERT(parent_gpu->isr.non_replayable_faults.handling);

    status = nvUvmInterfaceHasPendingNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
                                                         &has_pending_faults);
    UVM_ASSERT(status == NV_OK);

    return has_pending_faults == NV_TRUE;
}

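// Copy all pending fault entries from the RM shadow buffer into the local
// fault cache and parse them into uvm_fault_buffer_entry_t format. The number
// of entries fetched is returned in cached_faults.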
static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *parent_gpu, NvU32 *cached_faults)
{
    NV_STATUS status;
    NvU32 i;
    NvU32 entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &parent_gpu->fault_buffer_info.non_replayable;
    char *current_hw_entry = (char *)non_replayable_faults->shadow_buffer_copy;
    uvm_fault_buffer_entry_t *fault_entry = non_replayable_faults->fault_cache;

    UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.non_replayable_faults.service_lock));
    UVM_ASSERT(parent_gpu->non_replayable_faults_supported);

    status = nvUvmInterfaceGetNonReplayableFaults(&parent_gpu->fault_buffer_info.rm_info,
                                                  current_hw_entry,
                                                  cached_faults);

    if (status != NV_OK) {
        UVM_ERR_PRINT("nvUvmInterfaceGetNonReplayableFaults() failed: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_parent_gpu_name(parent_gpu));

        uvm_global_set_fatal_error(status);
        return status;
    }

    // Parse all faults
    for (i = 0; i < *cached_faults; ++i) {
        parent_gpu->fault_buffer_hal->parse_non_replayable_entry(parent_gpu, current_hw_entry, fault_entry);

        // The GPU aligns the fault addresses to 4k, but all of our tracking is
        // done in PAGE_SIZE chunks which might be larger.
        fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address);

        // Make sure that all fields in the entry are properly initialized
        fault_entry->va_space = NULL;
        fault_entry->is_fatal = (fault_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
        fault_entry->filtered = false;

        fault_entry->num_instances = 1;
        fault_entry->access_type_mask = uvm_fault_access_type_mask_bit(fault_entry->fault_access_type);
        INIT_LIST_HEAD(&fault_entry->merged_instances_list);
        fault_entry->non_replayable.buffer_index = i;

        if (fault_entry->is_fatal) {
            // Record the fatal fault event later as we need the va_space locked
            fault_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
        }
        else {
            fault_entry->fatal_reason = UvmEventFatalReasonInvalid;
        }

        current_hw_entry += entry_size;
        fault_entry++;
    }

    return NV_OK;
}

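// Returns true if the faulted bit has to be cleared by RM on UVM's behalf,
// using a SW method, instead of UVM clearing it directly with a HW method or
// a register write.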
static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
{
    // If true, UVM uses a SW method to request RM to do the clearing on its
    // behalf.
    bool use_sw_method = false;

    // In SRIOV, the UVM (guest) driver does not have access to the privileged
    // registers used to clear the faulted bit.
    if (uvm_parent_gpu_is_virt_mode_sriov(gpu->parent))
        use_sw_method = true;

    // In Confidential Computing, access to the privileged registers is
    // blocked, in order to prevent interference between guests, or between the
    // (untrusted) host and the guests.
    if (g_uvm_global.conf_computing_enabled)
        use_sw_method = true;

    if (use_sw_method)
        UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);

    return use_sw_method;
}

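// Clear the channel's faulted bit by pushing a method on a UVM-internal
// channel. The push is added to both the GPU-wide and the user channel's
// clear_faulted trackers so that GPU and channel teardown can wait on it.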
static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
                                             uvm_user_channel_t *user_channel,
                                             const uvm_fault_buffer_entry_t *fault_entry,
                                             NvU32 batch_id,
                                             uvm_tracker_t *tracker)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;

    UVM_ASSERT(!fault_entry->is_fatal);

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    tracker,
                                    &push,
                                    "Clearing set bit for address 0x%llx",
                                    fault_entry->fault_address);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Error acquiring tracker before clearing faulted: %s, GPU %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    if (use_clear_faulted_channel_sw_method(gpu))
        gpu->parent->host_hal->clear_faulted_channel_sw_method(&push, user_channel, fault_entry);
    else
        gpu->parent->host_hal->clear_faulted_channel_method(&push, user_channel, fault_entry);

    uvm_tools_broadcast_replay(gpu, &push, batch_id, fault_entry->fault_source.client_type);

    uvm_push_end(&push);

    // Add this push to the GPU's clear_faulted_tracker so GPU removal can wait
    // on it.
    status = uvm_tracker_add_push_safe(&non_replayable_faults->clear_faulted_tracker, &push);

    // Add this push to the channel's clear_faulted_tracker so user channel
    // removal can wait on it instead of using the per-GPU tracker, which would
    // require a lock.
    if (status == NV_OK)
        status = uvm_tracker_add_push_safe(&user_channel->clear_faulted_tracker, &push);

    return status;
}

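// Clear the channel's faulted bit with a CPU write to the channel register.
// This path is only used on GPUs without the clear_faulted HW method, and it
// requires all pending work in the tracker to have completed first.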
static NV_STATUS clear_faulted_register_on_gpu(uvm_gpu_t *gpu,
                                               uvm_user_channel_t *user_channel,
                                               const uvm_fault_buffer_entry_t *fault_entry,
                                               NvU32 batch_id,
                                               uvm_tracker_t *tracker)
{
    NV_STATUS status;

    UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);

    // We need to wait for all pending work before writing to the channel
    // register
    status = uvm_tracker_wait(tracker);
    if (status != NV_OK)
        return status;

    gpu->parent->host_hal->clear_faulted_channel_register(user_channel, fault_entry);

    uvm_tools_broadcast_replay_sync(gpu, batch_id, fault_entry->fault_source.client_type);

    return NV_OK;
}

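// Clear the faulted bit using whichever mechanism the GPU supports: a HW
// method, a SW method serviced by RM, or a direct register write.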
static NV_STATUS clear_faulted_on_gpu(uvm_gpu_t *gpu,
                                      uvm_user_channel_t *user_channel,
                                      const uvm_fault_buffer_entry_t *fault_entry,
                                      NvU32 batch_id,
                                      uvm_tracker_t *tracker)
{
    if (gpu->parent->has_clear_faulted_channel_method || use_clear_faulted_channel_sw_method(gpu))
        return clear_faulted_method_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);

    return clear_faulted_register_on_gpu(gpu, user_channel, fault_entry, batch_id, tracker);
}

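// Service a single non-replayable fault on managed memory, with the VA block
// lock held: check logical permissions, select the new residency for the
// faulting page, and call uvm_va_block_service_locked. Permission errors mark
// the fault as fatal instead of returning an error.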
static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
                                                       uvm_va_block_t *va_block,
                                                       uvm_va_block_retry_t *va_block_retry,
                                                       uvm_fault_buffer_entry_t *fault_entry,
                                                       uvm_service_block_context_t *service_context,
                                                       const bool hmm_migratable)
{
    NV_STATUS status = NV_OK;
    uvm_page_index_t page_index;
    uvm_perf_thrashing_hint_t thrashing_hint;
    uvm_processor_id_t new_residency;
    bool read_duplicate;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    const uvm_va_policy_t *policy;

    UVM_ASSERT(!fault_entry->is_fatal);

    uvm_assert_rwsem_locked(&va_space->lock);

    UVM_ASSERT(fault_entry->va_space == va_space);
    UVM_ASSERT(fault_entry->fault_address >= va_block->start);
    UVM_ASSERT(fault_entry->fault_address <= va_block->end);

    policy = uvm_va_policy_get(va_block, fault_entry->fault_address);

    if (service_context->num_retries == 0) {
        // Notify the fault event to tools/performance heuristics. For now we
        // use a unique batch id per fault, since we clear the faulted channel
        // for each fault.
        uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
                                        va_block,
                                        gpu->id,
                                        policy->preferred_location,
                                        fault_entry,
                                        ++non_replayable_faults->batch_id,
                                        false);
    }

    // Check logical permissions
    status = uvm_va_block_check_logical_permissions(va_block,
                                                    service_context->block_context,
                                                    gpu->id,
                                                    uvm_va_block_cpu_page_index(va_block,
                                                                                fault_entry->fault_address),
                                                    fault_entry->fault_access_type,
                                                    uvm_range_group_address_migratable(va_space,
                                                                                       fault_entry->fault_address));
    if (status != NV_OK) {
        fault_entry->is_fatal = true;
        fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
        return NV_OK;
    }

    // TODO: Bug 1880194: Revisit thrashing detection
    thrashing_hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;

    service_context->read_duplicate_count = 0;
    service_context->thrashing_pin_count = 0;

    page_index = uvm_va_block_cpu_page_index(va_block, fault_entry->fault_address);

    // Compute new residency and update the masks
    new_residency = uvm_va_block_select_residency(va_block,
                                                  service_context->block_context,
                                                  page_index,
                                                  gpu->id,
                                                  fault_entry->access_type_mask,
                                                  policy,
                                                  &thrashing_hint,
                                                  UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS,
                                                  hmm_migratable,
                                                  &read_duplicate);

    // Initialize the minimum necessary state in the fault service context
    uvm_processor_mask_zero(&service_context->resident_processors);

    // Set new residency and update the masks
    uvm_processor_mask_set(&service_context->resident_processors, new_residency);

    // The masks need to be fully zeroed as the fault region may grow due to
    // prefetching
    uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
    uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);

    if (read_duplicate) {
        uvm_page_mask_zero(&service_context->read_duplicate_mask);
        uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
        service_context->read_duplicate_count = 1;
    }

    service_context->access_type[page_index] = fault_entry->fault_access_type;

    service_context->region = uvm_va_block_region_for_page(page_index);

    status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, service_context);

    ++service_context->num_retries;

    return status;
}

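// Service a managed fault, taking the VA block lock and retrying the locked
// servicing routine as needed. The block's tracker is added to the
// fault_service_tracker so the faulted bit is only cleared once the servicing
// work has completed.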
static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
                                                uvm_va_block_t *va_block,
                                                uvm_fault_buffer_entry_t *fault_entry,
                                                const bool hmm_migratable)
{
    NV_STATUS status, tracker_status;
    uvm_va_block_retry_t va_block_retry;
    uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.non_replayable.block_service_context;

    service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
    service_context->num_retries = 0;

    if (uvm_va_block_is_hmm(va_block))
        uvm_hmm_migrate_begin_wait(va_block);

    uvm_mutex_lock(&va_block->lock);

    status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
                                       service_managed_fault_in_block_locked(gpu,
                                                                             va_block,
                                                                             &va_block_retry,
                                                                             fault_entry,
                                                                             service_context,
                                                                             hmm_migratable));

    tracker_status = uvm_tracker_add_tracker_safe(&gpu->parent->fault_buffer_info.non_replayable.fault_service_tracker,
                                                  &va_block->tracker);

    uvm_mutex_unlock(&va_block->lock);

    if (uvm_va_block_is_hmm(va_block))
        uvm_hmm_migrate_finish(va_block);

    return status == NV_OK ? tracker_status : status;
}

// See uvm_unregister_channel for comments on the channel destruction
// sequence.
static void kill_channel_delayed(void *_user_channel)
{
    uvm_user_channel_t *user_channel = (uvm_user_channel_t *)_user_channel;
    uvm_va_space_t *va_space = user_channel->kill_channel.va_space;

    uvm_va_space_down_read_rm(va_space);
    if (user_channel->gpu_va_space) {
        // RM handles the fault, which will do the correct fault reporting in
        // the kernel logs and will initiate channel teardown
        NV_STATUS status = nvUvmInterfaceReportNonReplayableFault(uvm_gpu_device_handle(user_channel->gpu),
                                                                  user_channel->kill_channel.fault_packet);
        UVM_ASSERT(status == NV_OK);
    }
    uvm_va_space_up_read_rm(va_space);

    uvm_user_channel_release(user_channel);
}

static void kill_channel_delayed_entry(void *user_channel)
{
    UVM_ENTRY_VOID(kill_channel_delayed(user_channel));
}

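// Defer killing the channel to a work item: the original fault packet is saved
// in the channel structure and later forwarded to RM from kill_channel_delayed,
// outside of the fault servicing path.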
static void schedule_kill_channel(uvm_gpu_t *gpu,
                                  uvm_fault_buffer_entry_t *fault_entry,
                                  uvm_user_channel_t *user_channel)
{
    uvm_va_space_t *va_space = fault_entry->va_space;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    void *packet = (char *)non_replayable_faults->shadow_buffer_copy +
                   (fault_entry->non_replayable.buffer_index * gpu->parent->fault_buffer_hal->entry_size(gpu->parent));

    UVM_ASSERT(gpu);
    UVM_ASSERT(va_space);
    UVM_ASSERT(user_channel);

    if (user_channel->kill_channel.scheduled)
        return;

    user_channel->kill_channel.scheduled = true;
    user_channel->kill_channel.va_space = va_space;

    // Save the packet to be handled by RM in the channel structure
    memcpy(user_channel->kill_channel.fault_packet, packet, gpu->parent->fault_buffer_hal->entry_size(gpu->parent));

    // Retain the channel here so it is not prematurely destroyed. It will be
    // released after forwarding the fault to RM in kill_channel_delayed.
    uvm_user_channel_retain(user_channel);

    // Schedule a work item to kill the channel
    nv_kthread_q_item_init(&user_channel->kill_channel.kill_channel_q_item,
                           kill_channel_delayed_entry,
                           user_channel);

    nv_kthread_q_schedule_q_item(&gpu->parent->isr.kill_channel_q,
                                 &user_channel->kill_channel.kill_channel_q_item);
}

static void service_fault_fatal(uvm_fault_buffer_entry_t *fault_entry, NV_STATUS status)
{
    UVM_ASSERT(fault_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH);

    fault_entry->is_fatal = true;
    fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
}

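// Service a fault whose address does not belong to a managed VA block. If ATS
// can service faults for this GPU VA space and the address maps to a valid
// VMA, the fault is serviced through uvm_ats_service_faults; otherwise it is
// marked as fatal. Logical errors in the application do not cause an error to
// be returned to the caller.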
static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
                                           struct mm_struct *mm,
                                           uvm_fault_buffer_entry_t *fault_entry,
                                           NV_STATUS lookup_status)
{
    uvm_gpu_t *gpu = gpu_va_space->gpu;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    uvm_ats_fault_invalidate_t *ats_invalidate = &non_replayable_faults->ats_invalidate;
    NV_STATUS status = lookup_status;
    NV_STATUS fatal_fault_status = NV_ERR_INVALID_ADDRESS;

    UVM_ASSERT(!fault_entry->is_fatal);

    // Avoid dropping fault events when the VA block is not found or cannot be
    // created
    uvm_perf_event_notify_gpu_fault(&fault_entry->va_space->perf_events,
                                    NULL,
                                    gpu->id,
                                    UVM_ID_INVALID,
                                    fault_entry,
                                    ++non_replayable_faults->batch_id,
                                    false);

    if (status != NV_ERR_INVALID_ADDRESS)
        return status;

    if (uvm_ats_can_service_faults(gpu_va_space, mm)) {
        struct vm_area_struct *vma;
        uvm_va_range_t *va_range_next;
        NvU64 fault_address = fault_entry->fault_address;
        uvm_fault_access_type_t fault_access_type = fault_entry->fault_access_type;
        uvm_ats_fault_context_t *ats_context = &non_replayable_faults->ats_context;

        uvm_page_mask_zero(&ats_context->read_fault_mask);
        uvm_page_mask_zero(&ats_context->write_fault_mask);

        ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;

        ats_invalidate->tlb_batch_pending = false;

        va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

        // The VA isn't managed. See if ATS knows about it.
        vma = find_vma_intersection(mm, fault_address, fault_address + 1);
        if (!vma || uvm_ats_check_in_gmmu_region(gpu_va_space->va_space, fault_address, va_range_next)) {

            // Do not return error due to logical errors in the application
            status = NV_OK;
        }
        else {
            NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
            uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
            uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;
            uvm_page_mask_t *fault_mask = (fault_access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) ?
                                          &ats_context->write_fault_mask :
                                          &ats_context->read_fault_mask;

            uvm_page_mask_set(fault_mask, page_index);

            status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
            if (status == NV_OK) {
                // Invalidate ATS TLB entries if needed
                if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
                    status = uvm_ats_invalidate_tlbs(gpu_va_space,
                                                     ats_invalidate,
                                                     &non_replayable_faults->fault_service_tracker);
                    fatal_fault_status = NV_OK;
                }
            }
            else {
                fatal_fault_status = status;
            }
        }
    }
    else {
        fatal_fault_status = status;

        // Do not return error due to logical errors in the application
        status = NV_OK;
    }

    if (fatal_fault_status != NV_OK)
        service_fault_fatal(fault_entry, fatal_fault_status);

    return status;
}

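// Service a single fault entry: look up the VA space and the user channel the
// fault came from, service it as a managed or non-managed fault, and clear the
// faulted bit on the channel. Fatal faults, and servicing errors, cause the
// channel to be scheduled for a kill.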
static NV_STATUS service_fault_once(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry, const bool hmm_migratable)
{
    NV_STATUS status;
    uvm_user_channel_t *user_channel;
    uvm_va_block_t *va_block;
    uvm_va_space_t *va_space = NULL;
    struct mm_struct *mm;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
    uvm_va_block_context_t *va_block_context =
        gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;

    status = uvm_parent_gpu_fault_entry_to_va_space(gpu->parent, fault_entry, &va_space);
    if (status != NV_OK) {
        // The VA space lookup will fail if we're running concurrently with
        // removal of the channel from the VA space (channel unregister, GPU VA
        // space unregister, VA space destroy, etc). The other thread will stop
        // the channel and remove the channel from the table, so the faulting
        // condition will be gone. In the case of replayable faults we need to
        // flush the buffer, but here we can just ignore the entry and proceed
        // on.
        //
        // Note that we can't have any subcontext issues here, since non-
        // replayable faults only use the address space of their channel.
        UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
        UVM_ASSERT(!va_space);
        return NV_OK;
    }

    UVM_ASSERT(va_space);

    // If an mm is registered with the VA space, we have to retain it
    // in order to lock it before locking the VA space. It is guaranteed
    // to remain valid until we release. If no mm is registered, we
    // can only service managed faults, not ATS/HMM faults.
    mm = uvm_va_space_mm_retain_lock(va_space);
    uvm_va_block_context_init(va_block_context, mm);

    uvm_va_space_down_read(va_space);

    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);

    if (!gpu_va_space) {
        // The va_space might have gone away. See the comment above.
        status = NV_OK;
        goto exit_no_channel;
    }

    fault_entry->va_space = va_space;

    user_channel = uvm_gpu_va_space_get_user_channel(gpu_va_space, fault_entry->instance_ptr);
    if (!user_channel) {
        // The channel might have gone away. See the comment above.
        status = NV_OK;
        goto exit_no_channel;
    }

    fault_entry->fault_source.channel_id = user_channel->hw_channel_id;

    if (!fault_entry->is_fatal) {
        if (mm) {
            status = uvm_va_block_find_create(fault_entry->va_space,
                                              fault_entry->fault_address,
                                              &va_block_context->hmm.vma,
                                              &va_block);
        }
        else {
            status = uvm_va_block_find_create_managed(fault_entry->va_space,
                                                      fault_entry->fault_address,
                                                      &va_block);
        }
        if (status == NV_OK)
            status = service_managed_fault_in_block(gpu_va_space->gpu, va_block, fault_entry, hmm_migratable);
        else
            status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);

        // We are done servicing, so clear the faulted bit on the channel so it
        // can be re-scheduled again
        if (status == NV_OK && !fault_entry->is_fatal) {
            status = clear_faulted_on_gpu(gpu,
                                          user_channel,
                                          fault_entry,
                                          non_replayable_faults->batch_id,
                                          &non_replayable_faults->fault_service_tracker);
            uvm_tracker_clear(&non_replayable_faults->fault_service_tracker);
        }
    }

    if (fault_entry->is_fatal)
        uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);

    if (fault_entry->is_fatal ||
        (status != NV_OK &&
         status != NV_WARN_MORE_PROCESSING_REQUIRED &&
         status != NV_WARN_MISMATCHED_TARGET))
        schedule_kill_channel(gpu, fault_entry, user_channel);

exit_no_channel:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_release_unlock(va_space, mm);

    if (status != NV_OK &&
        status != NV_WARN_MORE_PROCESSING_REQUIRED &&
        status != NV_WARN_MISMATCHED_TARGET)
        UVM_DBG_PRINT("Error servicing non-replayable faults on GPU: %s\n", uvm_gpu_name(gpu));

    return status;
}

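// Service a fault entry, retrying while servicing requires more processing.
// If servicing returns NV_WARN_MISMATCHED_TARGET, it is retried with
// hmm_migratable set to false.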
static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_entry)
{
    uvm_service_block_context_t *service_context =
        &gpu->parent->fault_buffer_info.non_replayable.block_service_context;
    NV_STATUS status;
    bool hmm_migratable = true;

    service_context->num_retries = 0;

    do {
        status = service_fault_once(gpu, fault_entry, hmm_migratable);
        if (status == NV_WARN_MISMATCHED_TARGET) {
            hmm_migratable = false;
            status = NV_WARN_MORE_PROCESSING_REQUIRED;
        }
    } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);

    return status;
}

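// Drain the non-replayable fault buffer: fetch batches of entries from the RM
// shadow buffer and service them one by one until no faults are left pending.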
void uvm_gpu_service_non_replayable_fault_buffer(uvm_gpu_t *gpu)
{
    NvU32 cached_faults;

    // If this handler is modified to handle fewer than all of the outstanding
    // faults, then special handling will need to be added to uvm_suspend()
    // to guarantee that fault processing has completed before control is
    // returned to the RM.
    do {
        NV_STATUS status;
        NvU32 i;

        status = fetch_non_replayable_fault_buffer_entries(gpu->parent, &cached_faults);
        if (status != NV_OK)
            return;

        // Unlike replayable faults, we do not batch up and preprocess
        // non-replayable faults, since getting multiple faults on the same
        // memory region is not very likely
        for (i = 0; i < cached_faults; ++i) {
            status = service_fault(gpu, &gpu->parent->fault_buffer_info.non_replayable.fault_cache[i]);
            if (status != NV_OK)
                return;
        }
    } while (cached_faults > 0);
}