1 /*******************************************************************************
2     Copyright (c) 2015-2024 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "linux/sort.h"
25 #include "nv_uvm_interface.h"
26 #include "uvm_common.h"
27 #include "uvm_linux.h"
28 #include "uvm_global.h"
29 #include "uvm_gpu_replayable_faults.h"
30 #include "uvm_hal.h"
31 #include "uvm_kvmalloc.h"
32 #include "uvm_tools.h"
33 #include "uvm_va_block.h"
34 #include "uvm_va_range.h"
35 #include "uvm_va_space.h"
36 #include "uvm_va_space_mm.h"
37 #include "uvm_procfs.h"
38 #include "uvm_perf_thrashing.h"
39 #include "uvm_gpu_non_replayable_faults.h"
40 #include "uvm_ats_faults.h"
41 #include "uvm_test.h"
42 
43 // The documentation at the beginning of uvm_gpu_non_replayable_faults.c
44 // provides some background for understanding replayable faults, non-replayable
45 // faults, and how UVM services each fault type.
46 
47 // The HW fault buffer flush mode instructs RM on how to flush the hardware
48 // replayable fault buffer; it is only used in Confidential Computing.
49 //
// Unless HW_FAULT_BUFFER_FLUSH_MODE_MOVE is functionally required (because UVM
// needs to inspect the faults currently present in the HW fault buffer), it is
// recommended to use HW_FAULT_BUFFER_FLUSH_MODE_DISCARD for performance
// reasons.
54 typedef enum
55 {
56     // Flush the HW fault buffer, discarding all the resulting faults. UVM never
57     // gets to see these faults.
58     HW_FAULT_BUFFER_FLUSH_MODE_DISCARD,
59 
60     // Flush the HW fault buffer, and move all the resulting faults to the SW
61     // fault ("shadow") buffer.
62     HW_FAULT_BUFFER_FLUSH_MODE_MOVE,
63 } hw_fault_buffer_flush_mode_t;
64 
65 #define UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT 1000
66 
// Lapse of time in milliseconds after which prefetch faults can be re-enabled.
// 0 means they are never disabled
69 static unsigned uvm_perf_reenable_prefetch_faults_lapse_msec = UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT;
70 module_param(uvm_perf_reenable_prefetch_faults_lapse_msec, uint, S_IRUGO);
71 
72 #define UVM_PERF_FAULT_BATCH_COUNT_MIN 1
73 #define UVM_PERF_FAULT_BATCH_COUNT_DEFAULT 256
74 
// Number of entries that are fetched from the GPU fault buffer and serviced as
// a single batch
77 static unsigned uvm_perf_fault_batch_count = UVM_PERF_FAULT_BATCH_COUNT_DEFAULT;
78 module_param(uvm_perf_fault_batch_count, uint, S_IRUGO);
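// As a usage sketch (assuming the standard nvidia-uvm module name), the batch
// size could be tuned at module load time, for example:
//   modprobe nvidia-uvm uvm_perf_fault_batch_count=512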
79 
80 #define UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH
81 
82 // Policy that determines when to issue fault replays
83 static uvm_perf_fault_replay_policy_t uvm_perf_fault_replay_policy = UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;
84 module_param(uvm_perf_fault_replay_policy, uint, S_IRUGO);
85 
86 #define UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT 50
87 
// Reading the fault buffer GET/PUT pointers from the CPU is expensive.
// However, updating PUT before flushing the buffer helps minimize the number
// of duplicates in the buffer, as it discards faults that were not processed
// because of the batch size limit or because they arrived during servicing.
// If PUT is not updated, the replay operation will make them show up again
// in the buffer as duplicates.
//
// We keep track of the number of duplicates in each batch and use
// UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT for the subsequent fault buffer flush
// if the percentage of duplicate faults in the batch is greater than the
// ratio defined in the following module parameter. Otherwise,
// UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT is used.
100 static unsigned uvm_perf_fault_replay_update_put_ratio = UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT;
101 module_param(uvm_perf_fault_replay_update_put_ratio, uint, S_IRUGO);
102 
103 #define UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT 20
104 
105 #define UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT 5
106 
107 // Maximum number of batches to be processed per execution of the bottom-half
108 static unsigned uvm_perf_fault_max_batches_per_service = UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT;
109 module_param(uvm_perf_fault_max_batches_per_service, uint, S_IRUGO);
110 
111 // Maximum number of batches with thrashing pages per execution of the bottom-half
112 static unsigned uvm_perf_fault_max_throttle_per_service = UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT;
113 module_param(uvm_perf_fault_max_throttle_per_service, uint, S_IRUGO);
114 
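// Enable/disable coalescing of duplicate faults (those with the same instance
// pointer and fault address) while fetching entries from the fault buffer.
// Non-zero enables coalescing.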
115 static unsigned uvm_perf_fault_coalesce = 1;
116 module_param(uvm_perf_fault_coalesce, uint, S_IRUGO);
117 
118 // This function is used for both the initial fault buffer initialization and
119 // the power management resume path.
120 static void fault_buffer_reinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
121 {
122     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
123 
    // Read the current get/put pointers, as this might not be the first time
    // we take control of the fault buffer since the GPU was initialized, and
    // because we may need to bring UVM's cached copies back in sync following
    // a sleep cycle.
128     replayable_faults->cached_get = parent_gpu->fault_buffer_hal->read_get(parent_gpu);
129     replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
130 
131     // (Re-)enable fault prefetching
132     if (parent_gpu->fault_buffer_info.prefetch_faults_enabled)
133         parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
134     else
135         parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu);
136 }
137 
138 // There is no error handling in this function. The caller is in charge of
139 // calling fault_buffer_deinit_replayable_faults on failure.
140 static NV_STATUS fault_buffer_init_replayable_faults(uvm_parent_gpu_t *parent_gpu)
141 {
142     NV_STATUS status = NV_OK;
143     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
144     uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;
145 
146     UVM_ASSERT(parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize %
147                parent_gpu->fault_buffer_hal->entry_size(parent_gpu) == 0);
148 
149     replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize /
150                                     parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
151 
152     // Check provided module parameter value
153     parent_gpu->fault_buffer_info.max_batch_size = max(uvm_perf_fault_batch_count,
154                                                        (NvU32)UVM_PERF_FAULT_BATCH_COUNT_MIN);
155     parent_gpu->fault_buffer_info.max_batch_size = min(parent_gpu->fault_buffer_info.max_batch_size,
156                                                        replayable_faults->max_faults);
157 
158     if (parent_gpu->fault_buffer_info.max_batch_size != uvm_perf_fault_batch_count) {
        pr_info("Invalid uvm_perf_fault_batch_count value on GPU %s: %u. Valid range [%u:%u]. Using %u instead\n",
160                 uvm_parent_gpu_name(parent_gpu),
161                 uvm_perf_fault_batch_count,
162                 UVM_PERF_FAULT_BATCH_COUNT_MIN,
163                 replayable_faults->max_faults,
164                 parent_gpu->fault_buffer_info.max_batch_size);
165     }
166 
167     batch_context->fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults * sizeof(*batch_context->fault_cache));
168     if (!batch_context->fault_cache)
169         return NV_ERR_NO_MEMORY;
170 
171     // fault_cache is used to signal that the tracker was initialized.
172     uvm_tracker_init(&replayable_faults->replay_tracker);
173 
174     batch_context->ordered_fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults *
175                                                            sizeof(*batch_context->ordered_fault_cache));
176     if (!batch_context->ordered_fault_cache)
177         return NV_ERR_NO_MEMORY;
178 
179     // This value must be initialized by HAL
180     UVM_ASSERT(replayable_faults->utlb_count > 0);
181 
182     batch_context->utlbs = uvm_kvmalloc_zero(replayable_faults->utlb_count * sizeof(*batch_context->utlbs));
183     if (!batch_context->utlbs)
184         return NV_ERR_NO_MEMORY;
185 
186     batch_context->max_utlb_id = 0;
187 
188     status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_TRUE));
189     if (status != NV_OK) {
190         UVM_ERR_PRINT("Failed to take page fault ownership from RM: %s, GPU %s\n",
191                       nvstatusToString(status),
192                       uvm_parent_gpu_name(parent_gpu));
193         return status;
194     }
195 
    replayable_faults->replay_policy = uvm_perf_fault_replay_policy < UVM_PERF_FAULT_REPLAY_POLICY_MAX ?
                                           uvm_perf_fault_replay_policy :
                                           UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;
199 
200     if (replayable_faults->replay_policy != uvm_perf_fault_replay_policy) {
201         pr_info("Invalid uvm_perf_fault_replay_policy value on GPU %s: %d. Using %d instead\n",
202                 uvm_parent_gpu_name(parent_gpu),
203                 uvm_perf_fault_replay_policy,
204                 replayable_faults->replay_policy);
205     }
206 
207     replayable_faults->replay_update_put_ratio = min(uvm_perf_fault_replay_update_put_ratio, 100u);
208     if (replayable_faults->replay_update_put_ratio != uvm_perf_fault_replay_update_put_ratio) {
209         pr_info("Invalid uvm_perf_fault_replay_update_put_ratio value on GPU %s: %u. Using %u instead\n",
210                 uvm_parent_gpu_name(parent_gpu),
211                 uvm_perf_fault_replay_update_put_ratio,
212                 replayable_faults->replay_update_put_ratio);
213     }
214 
215     // Re-enable fault prefetching just in case it was disabled in a previous run
216     parent_gpu->fault_buffer_info.prefetch_faults_enabled = parent_gpu->prefetch_fault_supported;
217 
218     fault_buffer_reinit_replayable_faults(parent_gpu);
219 
220     return NV_OK;
221 }
222 
223 static void fault_buffer_deinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
224 {
225     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
226     uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;
227 
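    // The replay tracker is initialized only after fault_cache is allocated
    // (see fault_buffer_init_replayable_faults), so fault_cache tells whether
    // the tracker needs to be deinitialized.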
228     if (batch_context->fault_cache) {
229         UVM_ASSERT(uvm_tracker_is_empty(&replayable_faults->replay_tracker));
230         uvm_tracker_deinit(&replayable_faults->replay_tracker);
231     }
232 
233     if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
234         // Re-enable prefetch faults in case we disabled them
235         if (parent_gpu->prefetch_fault_supported && !parent_gpu->fault_buffer_info.prefetch_faults_enabled)
236             parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
237     }
238 
239     uvm_kvfree(batch_context->fault_cache);
240     uvm_kvfree(batch_context->ordered_fault_cache);
241     uvm_kvfree(batch_context->utlbs);
242     batch_context->fault_cache         = NULL;
243     batch_context->ordered_fault_cache = NULL;
244     batch_context->utlbs               = NULL;
245 }
246 
247 NV_STATUS uvm_parent_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
248 {
249     NV_STATUS status = NV_OK;
250 
251     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
252     UVM_ASSERT(parent_gpu->replayable_faults_supported);
253 
254     status = uvm_rm_locked_call(nvUvmInterfaceInitFaultInfo(parent_gpu->rm_device,
255                                                             &parent_gpu->fault_buffer_info.rm_info));
256     if (status != NV_OK) {
257         UVM_ERR_PRINT("Failed to init fault buffer info from RM: %s, GPU %s\n",
258                       nvstatusToString(status),
259                       uvm_parent_gpu_name(parent_gpu));
260 
261         // nvUvmInterfaceInitFaultInfo may leave fields in rm_info populated
262         // when it returns an error. Set the buffer handle to zero as it is
263         // used by the deinitialization logic to determine if it was correctly
264         // initialized.
265         parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
266         goto fail;
267     }
268 
269     status = fault_buffer_init_replayable_faults(parent_gpu);
270     if (status != NV_OK)
271         goto fail;
272 
273     if (parent_gpu->non_replayable_faults_supported) {
274         status = uvm_parent_gpu_fault_buffer_init_non_replayable_faults(parent_gpu);
275         if (status != NV_OK)
276             goto fail;
277     }
278 
279     return NV_OK;
280 
281 fail:
282     uvm_parent_gpu_fault_buffer_deinit(parent_gpu);
283 
284     return status;
285 }
286 
287 // Reinitialize state relevant to replayable fault handling after returning
288 // from a power management cycle.
289 void uvm_parent_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu)
290 {
291     UVM_ASSERT(parent_gpu->replayable_faults_supported);
292 
293     fault_buffer_reinit_replayable_faults(parent_gpu);
294 }
295 
296 void uvm_parent_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
297 {
298     NV_STATUS status = NV_OK;
299 
300     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
301 
302     if (parent_gpu->non_replayable_faults_supported)
303         uvm_parent_gpu_fault_buffer_deinit_non_replayable_faults(parent_gpu);
304 
305     fault_buffer_deinit_replayable_faults(parent_gpu);
306 
307     if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
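        // Return page fault interrupt ownership to RM; UVM took ownership
        // during fault buffer initialization.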
308         status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_FALSE));
309         UVM_ASSERT(status == NV_OK);
310 
311         uvm_rm_locked_call_void(nvUvmInterfaceDestroyFaultInfo(parent_gpu->rm_device,
312                                                                &parent_gpu->fault_buffer_info.rm_info));
313 
314         parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
315     }
316 }
317 
318 bool uvm_parent_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
319 {
320     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
321 
322     UVM_ASSERT(parent_gpu->replayable_faults_supported);
323 
324     // Fast path 1: we left some faults unserviced in the buffer in the last pass
325     if (replayable_faults->cached_get != replayable_faults->cached_put)
326         return true;
327 
    // Fast path 2: read the valid bit of the fault buffer entry pointed to by
    // the cached get pointer
330     if (!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, replayable_faults->cached_get)) {
331         // Slow path: read the put pointer from the GPU register via BAR0
332         // over PCIe
333         replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
334 
335         // No interrupt pending
336         if (replayable_faults->cached_get == replayable_faults->cached_put)
337             return false;
338     }
339 
340     return true;
341 }
342 
// Push a fault cancel method on the given client. Any failure during this
// operation may lead to an application hang (requiring a manual Ctrl+C from
// the user) or a system crash (requiring a reboot). In that case we log an
// error message.
347 //
348 // gpc_id and client_id aren't used if global_cancel is true.
349 //
350 // This function acquires both the given tracker and the replay tracker
351 static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu,
352                                     uvm_gpu_phys_address_t instance_ptr,
353                                     bool global_cancel,
354                                     NvU32 gpc_id,
355                                     NvU32 client_id,
356                                     uvm_tracker_t *tracker)
357 {
358     NV_STATUS status;
359     uvm_push_t push;
360     uvm_tracker_t *replay_tracker = &gpu->parent->fault_buffer_info.replayable.replay_tracker;
361 
362     UVM_ASSERT(tracker != NULL);
363 
364     status = uvm_tracker_add_tracker_safe(tracker, replay_tracker);
365     if (status != NV_OK)
366         return status;
367 
368     if (global_cancel) {
369         status = uvm_push_begin_acquire(gpu->channel_manager,
370                                         UVM_CHANNEL_TYPE_MEMOPS,
371                                         tracker,
372                                         &push,
373                                         "Cancel targeting instance_ptr {0x%llx:%s}\n",
374                                         instance_ptr.address,
375                                         uvm_aperture_string(instance_ptr.aperture));
376     }
377     else {
378         status = uvm_push_begin_acquire(gpu->channel_manager,
379                                         UVM_CHANNEL_TYPE_MEMOPS,
380                                         tracker,
381                                         &push,
382                                         "Cancel targeting instance_ptr {0x%llx:%s} gpc %u client %u\n",
383                                         instance_ptr.address,
384                                         uvm_aperture_string(instance_ptr.aperture),
385                                         gpc_id,
386                                         client_id);
387     }
388 
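    // Beginning the push is expected to succeed; assert that, but keep the
    // error handling below since execution may continue past the assert.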
389     UVM_ASSERT(status == NV_OK);
390     if (status != NV_OK) {
391         UVM_ERR_PRINT("Failed to create push and acquire trackers before pushing cancel: %s, GPU %s\n",
392                       nvstatusToString(status),
393                       uvm_gpu_name(gpu));
394         return status;
395     }
396 
397     if (global_cancel)
398         gpu->parent->host_hal->cancel_faults_global(&push, instance_ptr);
399     else
400         gpu->parent->host_hal->cancel_faults_targeted(&push, instance_ptr, gpc_id, client_id);
401 
402     // We don't need to put the cancel in the GPU replay tracker since we wait
403     // on it immediately.
404     status = uvm_push_end_and_wait(&push);
405 
406     UVM_ASSERT(status == NV_OK);
407     if (status != NV_OK)
408         UVM_ERR_PRINT("Failed to wait for pushed cancel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
409 
410     // The cancellation is complete, so the input trackers must be complete too.
411     uvm_tracker_clear(tracker);
412     uvm_tracker_clear(replay_tracker);
413 
414     return status;
415 }
416 
417 static NV_STATUS push_cancel_on_gpu_targeted(uvm_gpu_t *gpu,
418                                              uvm_gpu_phys_address_t instance_ptr,
419                                              NvU32 gpc_id,
420                                              NvU32 client_id,
421                                              uvm_tracker_t *tracker)
422 {
423     return push_cancel_on_gpu(gpu, instance_ptr, false, gpc_id, client_id, tracker);
424 }
425 
426 static NV_STATUS push_cancel_on_gpu_global(uvm_gpu_t *gpu, uvm_gpu_phys_address_t instance_ptr, uvm_tracker_t *tracker)
427 {
428     UVM_ASSERT(!gpu->parent->smc.enabled);
429 
430     return push_cancel_on_gpu(gpu, instance_ptr, true, 0, 0, tracker);
431 }
432 
// Volta implements a targeted VA fault cancel that simplifies the fault cancel
// process: only the address, type, and mmu_engine_id of the access to be
// cancelled need to be specified. The caller must hold the lock of the VA
// space containing the access to be cancelled.
437 static NV_STATUS cancel_fault_precise_va(uvm_gpu_t *gpu,
438                                          uvm_fault_buffer_entry_t *fault_entry,
439                                          uvm_fault_cancel_va_mode_t cancel_va_mode)
440 {
441     NV_STATUS status;
442     uvm_gpu_va_space_t *gpu_va_space;
443     uvm_gpu_phys_address_t pdb;
444     uvm_push_t push;
445     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
446     NvU64 offset;
447 
448     UVM_ASSERT(gpu->parent->replayable_faults_supported);
449     UVM_ASSERT(fault_entry->fatal_reason != UvmEventFatalReasonInvalid);
450     UVM_ASSERT(!fault_entry->filtered);
451 
452     gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(fault_entry->va_space, gpu->parent);
453     UVM_ASSERT(gpu_va_space);
454     pdb = uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr;
455 
456     // Record fatal fault event
457     uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);
458 
459     status = uvm_push_begin_acquire(gpu->channel_manager,
460                                     UVM_CHANNEL_TYPE_MEMOPS,
461                                     &replayable_faults->replay_tracker,
462                                     &push,
463                                     "Precise cancel targeting PDB {0x%llx:%s} VA 0x%llx VEID %u with access type %s",
464                                     pdb.address,
465                                     uvm_aperture_string(pdb.aperture),
466                                     fault_entry->fault_address,
467                                     fault_entry->fault_source.ve_id,
468                                     uvm_fault_access_type_string(fault_entry->fault_access_type));
469     if (status != NV_OK) {
470         UVM_ERR_PRINT("Failed to create push and acquire replay tracker before pushing cancel: %s, GPU %s\n",
471                       nvstatusToString(status),
472                       uvm_gpu_name(gpu));
473         return status;
474     }
475 
    // UVM aligns fault addresses to PAGE_SIZE as it is the smallest mapping
    // and coherence tracking granularity. However, the cancel method requires
    // the original address (4K-aligned) reported in the packet, which is lost
    // at this point. Since the access permissions are the same for the whole
    // page, we issue a cancel per 4K range within it to make sure that the HW
    // sees the address reported in the packet.
482     for (offset = 0; offset < PAGE_SIZE; offset += UVM_PAGE_SIZE_4K) {
483         gpu->parent->host_hal->cancel_faults_va(&push, pdb, fault_entry, cancel_va_mode);
484         fault_entry->fault_address += UVM_PAGE_SIZE_4K;
485     }
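    // Restore the original page-aligned fault address, which the loop above
    // advanced past the end of the page.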
486     fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address - 1);
487 
488     // We don't need to put the cancel in the GPU replay tracker since we wait
489     // on it immediately.
490     status = uvm_push_end_and_wait(&push);
491     if (status != NV_OK) {
492         UVM_ERR_PRINT("Failed to wait for pushed VA global fault cancel: %s, GPU %s\n",
493                       nvstatusToString(status), uvm_gpu_name(gpu));
494     }
495 
496     uvm_tracker_clear(&replayable_faults->replay_tracker);
497 
498     return status;
499 }
500 
501 static NV_STATUS push_replay_on_gpu(uvm_gpu_t *gpu,
502                                     uvm_fault_replay_type_t type,
503                                     uvm_fault_service_batch_context_t *batch_context)
504 {
505     NV_STATUS status;
506     uvm_push_t push;
507     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
508     uvm_tracker_t *tracker = NULL;
509 
510     if (batch_context)
511         tracker = &batch_context->tracker;
512 
513     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, tracker, &push,
514                                     "Replaying faults");
515     if (status != NV_OK)
516         return status;
517 
518     gpu->parent->host_hal->replay_faults(&push, type);
519 
520     // Do not count REPLAY_TYPE_START_ACK_ALL's toward the replay count.
521     // REPLAY_TYPE_START_ACK_ALL's are issued for cancels, and the cancel
522     // algorithm checks to make sure that no REPLAY_TYPE_START's have been
523     // issued using batch_context->replays.
524     if (batch_context && type != UVM_FAULT_REPLAY_TYPE_START_ACK_ALL) {
525         uvm_tools_broadcast_replay(gpu, &push, batch_context->batch_id, UVM_FAULT_CLIENT_TYPE_GPC);
526         ++batch_context->num_replays;
527     }
528 
529     uvm_push_end(&push);
530 
531     // Add this push to the GPU's replay_tracker so cancel can wait on it.
532     status = uvm_tracker_add_push_safe(&replayable_faults->replay_tracker, &push);
533 
534     if (uvm_procfs_is_debug_enabled()) {
535         if (type == UVM_FAULT_REPLAY_TYPE_START)
536             ++replayable_faults->stats.num_replays;
537         else
538             ++replayable_faults->stats.num_replays_ack_all;
539     }
540 
541     return status;
542 }
543 
544 static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
545 {
546     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
547 
548     UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
549 
550     // Write get on the GPU only if it's changed.
551     if (replayable_faults->cached_get == get)
552         return;
553 
554     replayable_faults->cached_get = get;
555 
556     // Update get pointer on the GPU
557     parent_gpu->fault_buffer_hal->write_get(parent_gpu, get);
558 }
559 
// In Confidential Computing, GSP-RM owns the HW replayable fault buffer.
// Flushing the fault buffer implies flushing both the HW buffer (using an RM
// API) and the SW buffer accessible by UVM (the "shadow" buffer).
563 //
564 // The HW buffer needs to be flushed first. This is because, once that flush
565 // completes, any faults that were present in the HW buffer have been moved to
566 // the shadow buffer, or have been discarded by RM.
567 static NV_STATUS hw_fault_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu, hw_fault_buffer_flush_mode_t flush_mode)
568 {
569     NV_STATUS status;
570     NvBool is_flush_mode_move;
571 
572     UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
573     UVM_ASSERT((flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_MOVE) || (flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_DISCARD));
574 
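    // The HW fault buffer is owned by GSP-RM only in Confidential Computing;
    // otherwise there is nothing to flush here.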
575     if (!g_uvm_global.conf_computing_enabled)
576         return NV_OK;
577 
578     is_flush_mode_move = (NvBool) (flush_mode == HW_FAULT_BUFFER_FLUSH_MODE_MOVE);
579     status = nvUvmInterfaceFlushReplayableFaultBuffer(&parent_gpu->fault_buffer_info.rm_info, is_flush_mode_move);
580 
581     UVM_ASSERT(status == NV_OK);
582 
583     return status;
584 }
585 
586 static void fault_buffer_skip_replayable_entry(uvm_parent_gpu_t *parent_gpu, NvU32 index)
587 {
588     UVM_ASSERT(parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, index));
589 
590     // Flushed faults are never decrypted, but the decryption IV associated with
591     // replayable faults still requires manual adjustment so it is kept in sync
    // with the encryption IV on the GSP-RM side.
593     if (g_uvm_global.conf_computing_enabled)
594         uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, 1);
595 
596     parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index);
597 }
598 
599 static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,
600                                            uvm_gpu_buffer_flush_mode_t flush_mode,
601                                            uvm_fault_replay_type_t fault_replay,
602                                            uvm_fault_service_batch_context_t *batch_context)
603 {
604     NvU32 get;
605     NvU32 put;
606     uvm_spin_loop_t spin;
607     uvm_parent_gpu_t *parent_gpu = gpu->parent;
608     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
609     NV_STATUS status;
610 
611     UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
612     UVM_ASSERT(parent_gpu->replayable_faults_supported);
613 
614     // Wait for the prior replay to flush out old fault messages
615     if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) {
616         status = uvm_tracker_wait(&replayable_faults->replay_tracker);
617         if (status != NV_OK)
618             return status;
619     }
620 
621     // Read PUT pointer from the GPU if requested
622     if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT || flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT) {
623         status = hw_fault_buffer_flush_locked(parent_gpu, HW_FAULT_BUFFER_FLUSH_MODE_DISCARD);
624         if (status != NV_OK)
625             return status;
626         replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
627     }
628 
629     get = replayable_faults->cached_get;
630     put = replayable_faults->cached_put;
631 
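    // Discard every fault between GET and PUT by clearing its valid bit
    // without servicing it.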
632     while (get != put) {
633         // Wait until valid bit is set
634         UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin);
635 
636         fault_buffer_skip_replayable_entry(parent_gpu, get);
637         ++get;
638         if (get == replayable_faults->max_faults)
639             get = 0;
640     }
641 
642     write_get(gpu->parent, get);
643 
644     // Issue fault replay
645     return push_replay_on_gpu(gpu, fault_replay, batch_context);
646 }
647 
648 NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu)
649 {
650     NV_STATUS status = NV_OK;
651 
652     UVM_ASSERT(gpu->parent->replayable_faults_supported);
653 
654     // Disables replayable fault interrupts and fault servicing
655     uvm_parent_gpu_replayable_faults_isr_lock(gpu->parent);
656 
657     status = fault_buffer_flush_locked(gpu,
658                                        UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
659                                        UVM_FAULT_REPLAY_TYPE_START,
660                                        NULL);
661 
662     // This will trigger the top half to start servicing faults again, if the
663     // replay brought any back in
664     uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
665     return status;
666 }
667 
668 static inline int cmp_fault_instance_ptr(const uvm_fault_buffer_entry_t *a,
669                                          const uvm_fault_buffer_entry_t *b)
670 {
671     int result = uvm_gpu_phys_addr_cmp(a->instance_ptr, b->instance_ptr);
    // On Volta+ we need to sort by the {instance_ptr, ve_id} pair, since each
    // subcontext can map to a different VA space
674     if (result != 0)
675         return result;
676     return UVM_CMP_DEFAULT(a->fault_source.ve_id, b->fault_source.ve_id);
677 }
678 
679 // Compare two VA spaces
680 static inline int cmp_va_space(const uvm_va_space_t *a, const uvm_va_space_t *b)
681 {
682     return UVM_CMP_DEFAULT(a, b);
683 }
684 
685 // Compare two virtual addresses
686 static inline int cmp_addr(NvU64 a, NvU64 b)
687 {
688     return UVM_CMP_DEFAULT(a, b);
689 }
690 
691 // Compare two fault access types
692 static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_type_t b)
693 {
694     UVM_ASSERT(a >= 0 && a < UVM_FAULT_ACCESS_TYPE_COUNT);
695     UVM_ASSERT(b >= 0 && b < UVM_FAULT_ACCESS_TYPE_COUNT);
696 
697     // Check that fault access type enum values are ordered by "intrusiveness"
698     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG <= UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK);
699     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK <= UVM_FAULT_ACCESS_TYPE_WRITE);
700     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_WRITE <= UVM_FAULT_ACCESS_TYPE_READ);
701     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_READ <= UVM_FAULT_ACCESS_TYPE_PREFETCH);
702 
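    // Sort in descending order of intrusiveness: a more intrusive access type
    // compares as lower, so it comes first after sorting.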
703     return b - a;
704 }
705 
706 typedef enum
707 {
708     // Fetch a batch of faults from the buffer. Stop at the first entry that is
709     // not ready yet
710     FAULT_FETCH_MODE_BATCH_READY,
711 
712     // Fetch all faults in the buffer before PUT. Wait for all faults to become
713     // ready
714     FAULT_FETCH_MODE_ALL,
715 } fault_fetch_mode_t;
716 
717 static void fetch_fault_buffer_merge_entry(uvm_fault_buffer_entry_t *current_entry,
718                                            uvm_fault_buffer_entry_t *last_entry)
719 {
720     UVM_ASSERT(last_entry->num_instances > 0);
721 
722     ++last_entry->num_instances;
723     uvm_fault_access_type_mask_set(&last_entry->access_type_mask, current_entry->fault_access_type);
724 
725     if (current_entry->fault_access_type > last_entry->fault_access_type) {
726         // If the new entry has a higher access type, it becomes the
727         // fault to be serviced. Add the previous one to the list of instances
728         current_entry->access_type_mask = last_entry->access_type_mask;
729         current_entry->num_instances = last_entry->num_instances;
730         last_entry->filtered = true;
731 
        // Faults from different uTLBs are only merged when the new fault has
        // an access type with the same or lower level of intrusiveness (see
        // fetch_fault_buffer_try_merge_entry). Since this fault has a higher
        // access type, both entries must come from the same uTLB.
734         UVM_ASSERT(current_entry->fault_source.utlb_id == last_entry->fault_source.utlb_id);
735 
736         list_replace(&last_entry->merged_instances_list, &current_entry->merged_instances_list);
737         list_add(&last_entry->merged_instances_list, &current_entry->merged_instances_list);
738     }
739     else {
740         // Add the new entry to the list of instances for reporting purposes
741         current_entry->filtered = true;
742         list_add(&current_entry->merged_instances_list, &last_entry->merged_instances_list);
743     }
744 }
745 
746 static bool fetch_fault_buffer_try_merge_entry(uvm_fault_buffer_entry_t *current_entry,
747                                                uvm_fault_service_batch_context_t *batch_context,
748                                                uvm_fault_utlb_info_t *current_tlb,
749                                                bool is_same_instance_ptr)
750 {
751     uvm_fault_buffer_entry_t *last_tlb_entry = current_tlb->last_fault;
752     uvm_fault_buffer_entry_t *last_global_entry = batch_context->last_fault;
753 
    // Check against the last coalesced fault from this uTLB and the last
    // coalesced fault in the whole batch
756     const bool is_last_tlb_fault = current_tlb->num_pending_faults > 0 &&
757                                    cmp_fault_instance_ptr(current_entry, last_tlb_entry) == 0 &&
758                                    current_entry->fault_address == last_tlb_entry->fault_address;
759 
760     // We only merge faults from different uTLBs if the new fault has an
761     // access type with the same or lower level of intrusiveness. This is to
762     // avoid having to update num_pending_faults on both uTLBs and recomputing
763     // last_fault.
764     const bool is_last_fault = is_same_instance_ptr &&
765                                current_entry->fault_address == last_global_entry->fault_address &&
766                                current_entry->fault_access_type <= last_global_entry->fault_access_type;
767 
768     if (is_last_tlb_fault) {
769         fetch_fault_buffer_merge_entry(current_entry, last_tlb_entry);
770         if (current_entry->fault_access_type > last_tlb_entry->fault_access_type)
771             current_tlb->last_fault = current_entry;
772 
773         return true;
774     }
775     else if (is_last_fault) {
776         fetch_fault_buffer_merge_entry(current_entry, last_global_entry);
777         if (current_entry->fault_access_type > last_global_entry->fault_access_type)
778             batch_context->last_fault = current_entry;
779 
780         return true;
781     }
782 
783     return false;
784 }
785 
786 // Fetch entries from the fault buffer, decode them and store them in the batch
787 // context. We implement the fetch modes described above.
788 //
789 // When possible, we coalesce duplicate entries to minimize the fault handling
790 // overhead. Basically, we merge faults with the same instance pointer and page
791 // virtual address. We keep track of the last fault per uTLB to detect
792 // duplicates due to local reuse and the last fault in the whole batch to
793 // detect reuse across CTAs.
794 //
795 // We will service the first fault entry with the most "intrusive" (atomic >
796 // write > read > prefetch) access type*. That fault entry is called the
// "representative". The remaining coalesced faults have the "filtered" flag set
798 // and are added to a list in the representative fault entry for reporting
799 // purposes. The representative fault entry also contains a mask with all the
800 // access types that produced a fault on the page.
801 //
802 // *We only merge faults from different uTLBs if the new fault has an access
803 // type with the same or lower level of intrusiveness.
804 //
805 // This optimization cannot be performed during fault cancel on Pascal GPUs
806 // (fetch_mode == FAULT_FETCH_MODE_ALL) since we need accurate tracking of all
807 // the faults in each uTLB in order to guarantee precise fault attribution.
808 static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
809                                             uvm_fault_service_batch_context_t *batch_context,
810                                             fault_fetch_mode_t fetch_mode)
811 {
812     NvU32 get;
813     NvU32 put;
814     NvU32 fault_index;
815     NvU32 num_coalesced_faults;
816     NvU32 utlb_id;
817     uvm_fault_buffer_entry_t *fault_cache;
818     uvm_spin_loop_t spin;
819     NV_STATUS status = NV_OK;
820     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
821     const bool in_pascal_cancel_path = (!gpu->parent->fault_cancel_va_supported && fetch_mode == FAULT_FETCH_MODE_ALL);
822     const bool may_filter = uvm_perf_fault_coalesce && !in_pascal_cancel_path;
823 
824     UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
825     UVM_ASSERT(gpu->parent->replayable_faults_supported);
826 
827     fault_cache = batch_context->fault_cache;
828 
829     get = replayable_faults->cached_get;
830 
831     // Read put pointer from GPU and cache it
832     if (get == replayable_faults->cached_put)
833         replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);
834 
835     put = replayable_faults->cached_put;
836 
837     batch_context->is_single_instance_ptr = true;
838     batch_context->last_fault = NULL;
839 
840     fault_index = 0;
841     num_coalesced_faults = 0;
842 
843     // Clear uTLB counters
844     for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
845         batch_context->utlbs[utlb_id].num_pending_faults = 0;
846         batch_context->utlbs[utlb_id].has_fatal_faults = false;
847     }
848     batch_context->max_utlb_id = 0;
849 
850     if (get == put)
851         goto done;
852 
    // Parse entries until GET catches up to PUT, or until we run out of room
    // in the batch cache.
854     while ((get != put) &&
855            (fetch_mode == FAULT_FETCH_MODE_ALL || fault_index < gpu->parent->fault_buffer_info.max_batch_size)) {
856         bool is_same_instance_ptr = true;
857         uvm_fault_buffer_entry_t *current_entry = &fault_cache[fault_index];
858         uvm_fault_utlb_info_t *current_tlb;
859 
        // We cannot just wait for the last entry (the one pointed to by put)
        // to become valid; we have to check each entry individually, since
        // entries can be written out of order
863         UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
864             // We have some entry to work on. Let's do the rest later.
865             if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
866                 goto done;
867         }
868 
        // Prevent later accesses from being moved above the read of the valid bit
870         smp_mb__after_atomic();
871 
872         // Got valid bit set. Let's cache.
873         status = gpu->parent->fault_buffer_hal->parse_replayable_entry(gpu->parent, get, current_entry);
874         if (status != NV_OK)
875             goto done;
876 
877         // The GPU aligns the fault addresses to 4k, but all of our tracking is
878         // done in PAGE_SIZE chunks which might be larger.
879         current_entry->fault_address = UVM_PAGE_ALIGN_DOWN(current_entry->fault_address);
880 
881         // Make sure that all fields in the entry are properly initialized
882         current_entry->is_fatal = (current_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
883 
884         if (current_entry->is_fatal) {
885             // Record the fatal fault event later as we need the va_space locked
886             current_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
887         }
888         else {
889             current_entry->fatal_reason = UvmEventFatalReasonInvalid;
890         }
891 
892         current_entry->va_space = NULL;
893         current_entry->filtered = false;
894         current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
895 
896         if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
897             UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
898             batch_context->max_utlb_id = current_entry->fault_source.utlb_id;
899         }
900 
901         current_tlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
902 
903         if (fault_index > 0) {
904             UVM_ASSERT(batch_context->last_fault);
905             is_same_instance_ptr = cmp_fault_instance_ptr(current_entry, batch_context->last_fault) == 0;
906 
907             // Coalesce duplicate faults when possible
908             if (may_filter && !current_entry->is_fatal) {
909                 bool merged = fetch_fault_buffer_try_merge_entry(current_entry,
910                                                                  batch_context,
911                                                                  current_tlb,
912                                                                  is_same_instance_ptr);
913                 if (merged)
914                     goto next_fault;
915             }
916         }
917 
918         if (batch_context->is_single_instance_ptr && !is_same_instance_ptr)
919             batch_context->is_single_instance_ptr = false;
920 
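        // The fault was not coalesced with a previous entry: initialize it as
        // a new representative fault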
921         current_entry->num_instances = 1;
922         current_entry->access_type_mask = uvm_fault_access_type_mask_bit(current_entry->fault_access_type);
923         INIT_LIST_HEAD(&current_entry->merged_instances_list);
924 
925         ++current_tlb->num_pending_faults;
926         current_tlb->last_fault = current_entry;
927         batch_context->last_fault = current_entry;
928 
929         ++num_coalesced_faults;
930 
931     next_fault:
932         ++fault_index;
933         ++get;
934         if (get == replayable_faults->max_faults)
935             get = 0;
936     }
937 
938 done:
939     write_get(gpu->parent, get);
940 
941     batch_context->num_cached_faults = fault_index;
942     batch_context->num_coalesced_faults = num_coalesced_faults;
943 
944     return status;
945 }
946 
947 // Sort comparator for pointers to fault buffer entries that sorts by
948 // instance pointer
949 static int cmp_sort_fault_entry_by_instance_ptr(const void *_a, const void *_b)
950 {
951     const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a;
952     const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b;
953 
954     return cmp_fault_instance_ptr(*a, *b);
955 }
956 
957 // Sort comparator for pointers to fault buffer entries that sorts by va_space,
958 // fault address and fault access type
959 static int cmp_sort_fault_entry_by_va_space_address_access_type(const void *_a, const void *_b)
960 {
961     const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a;
962     const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b;
963 
964     int result;
965 
966     result = cmp_va_space((*a)->va_space, (*b)->va_space);
967     if (result != 0)
968         return result;
969 
970     result = cmp_addr((*a)->fault_address, (*b)->fault_address);
971     if (result != 0)
972         return result;
973 
974     return cmp_access_type((*a)->fault_access_type, (*b)->fault_access_type);
975 }
976 
977 // Translate all instance pointers to VA spaces. Since the buffer is ordered by
978 // instance_ptr, we minimize the number of translations
979 //
980 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer
981 // flush occurred and executed successfully, or the error code if it failed.
982 // NV_OK otherwise.
983 static NV_STATUS translate_instance_ptrs(uvm_gpu_t *gpu,
984                                          uvm_fault_service_batch_context_t *batch_context)
985 {
986     NvU32 i;
987     NV_STATUS status;
988 
989     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
990         uvm_fault_buffer_entry_t *current_entry;
991 
992         current_entry = batch_context->ordered_fault_cache[i];
993 
994         // If this instance pointer matches the previous instance pointer, just
995         // copy over the already-translated va_space and move on.
996         if (i != 0 && cmp_fault_instance_ptr(current_entry, batch_context->ordered_fault_cache[i - 1]) == 0) {
997             current_entry->va_space = batch_context->ordered_fault_cache[i - 1]->va_space;
998             continue;
999         }
1000 
1001         status = uvm_parent_gpu_fault_entry_to_va_space(gpu->parent, current_entry, &current_entry->va_space);
1002         if (status != NV_OK) {
1003             if (status == NV_ERR_PAGE_TABLE_NOT_AVAIL) {
1004                 // The channel is valid but the subcontext is not. This can only
1005                 // happen if the subcontext is torn down before its work is
1006                 // complete while other subcontexts in the same TSG are still
1007                 // executing. This is a violation of the programming model. We
1008                 // have limited options since the VA space is gone, meaning we
1009                 // can't target the PDB for cancel even if we wanted to. So
1010                 // we'll just throw away precise attribution and cancel this
1011                 // fault using the SW method, which validates that the intended
1012                 // context (TSG) is still running so we don't cancel an innocent
1013                 // context.
1014                 UVM_ASSERT(!current_entry->va_space);
1015                 UVM_ASSERT(gpu->max_subcontexts > 0);
1016 
1017                 if (gpu->parent->smc.enabled) {
1018                     status = push_cancel_on_gpu_targeted(gpu,
1019                                                          current_entry->instance_ptr,
1020                                                          current_entry->fault_source.gpc_id,
1021                                                          current_entry->fault_source.client_id,
1022                                                          &batch_context->tracker);
1023                 }
1024                 else {
1025                     status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
1026                 }
1027 
1028                 if (status != NV_OK)
1029                     return status;
1030 
1031                 // Fall through and let the flush restart fault processing
1032             }
1033             else {
1034                 UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
1035             }
1036 
1037             // If the channel is gone then we're looking at a stale fault entry.
1038             // The fault must have been resolved already (serviced or
1039             // cancelled), so we can just flush the fault buffer.
1040             //
1041             // No need to use UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT since
1042             // there was a context preemption for the entries we want to flush,
1043             // meaning PUT must reflect them.
1044             status = fault_buffer_flush_locked(gpu,
1045                                                UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
1046                                                UVM_FAULT_REPLAY_TYPE_START,
1047                                                batch_context);
1048             if (status != NV_OK)
1049                  return status;
1050 
1051             return NV_WARN_MORE_PROCESSING_REQUIRED;
1052         }
1053         else {
1054             UVM_ASSERT(current_entry->va_space);
1055         }
1056     }
1057 
1058     return NV_OK;
1059 }
1060 
1061 // Fault cache preprocessing for fault coalescing
1062 //
1063 // This function generates an ordered view of the given fault_cache in which
1064 // faults are sorted by VA space, fault address (aligned to 4K) and access type
1065 // "intrusiveness". In order to minimize the number of instance_ptr to VA space
1066 // translations we perform a first sort by instance_ptr.
1067 //
1068 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer
1069 // flush occurred during instance_ptr translation and executed successfully, or
1070 // the error code if it failed. NV_OK otherwise.
1071 //
1072 // Current scheme:
1073 // 1) sort by instance_ptr
1074 // 2) translate all instance_ptrs to VA spaces
1075 // 3) sort by va_space, fault address (fault_address is page-aligned at this
1076 //    point) and access type
1077 static NV_STATUS preprocess_fault_batch(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
1078 {
1079     NV_STATUS status;
1080     NvU32 i, j;
1081     uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache;
1082 
1083     UVM_ASSERT(batch_context->num_coalesced_faults > 0);
1084     UVM_ASSERT(batch_context->num_cached_faults >= batch_context->num_coalesced_faults);
1085 
1086     // Generate an ordered view of the fault cache in ordered_fault_cache.
1087     // We sort the pointers, not the entries in fault_cache
1088 
1089     // Initialize pointers before they are sorted. We only sort one instance per
1090     // coalesced fault
1091     for (i = 0, j = 0; i < batch_context->num_cached_faults; ++i) {
1092         if (!batch_context->fault_cache[i].filtered)
1093             ordered_fault_cache[j++] = &batch_context->fault_cache[i];
1094     }
1095     UVM_ASSERT(j == batch_context->num_coalesced_faults);
1096 
    // 1) If the fault batch contains more than one instance_ptr, sort by
    //    instance_ptr
1098     if (!batch_context->is_single_instance_ptr) {
1099         sort(ordered_fault_cache,
1100              batch_context->num_coalesced_faults,
1101              sizeof(*ordered_fault_cache),
1102              cmp_sort_fault_entry_by_instance_ptr,
1103              NULL);
1104     }
1105 
1106     // 2) translate all instance_ptrs to VA spaces
1107     status = translate_instance_ptrs(gpu, batch_context);
1108     if (status != NV_OK)
1109         return status;
1110 
1111     // 3) sort by va_space, fault address (GPU already reports 4K-aligned
1112     // address) and access type
1113     sort(ordered_fault_cache,
1114          batch_context->num_coalesced_faults,
1115          sizeof(*ordered_fault_cache),
1116          cmp_sort_fault_entry_by_va_space_address_access_type,
1117          NULL);
1118 
1119     return NV_OK;
1120 }
1121 
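// Two fault entries are considered duplicates when they target the same fault
// address within the same VA space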
1122 static bool check_fault_entry_duplicate(const uvm_fault_buffer_entry_t *current_entry,
1123                                         const uvm_fault_buffer_entry_t *previous_entry)
1124 {
1125     bool is_duplicate = false;
1126 
1127     if (previous_entry) {
1128         is_duplicate = (current_entry->va_space == previous_entry->va_space) &&
1129                        (current_entry->fault_address == previous_entry->fault_address);
1130     }
1131 
1132     return is_duplicate;
1133 }
1134 
1135 static void update_batch_and_notify_fault(uvm_gpu_t *gpu,
1136                                           uvm_fault_service_batch_context_t *batch_context,
1137                                           uvm_va_block_t *va_block,
1138                                           uvm_processor_id_t preferred_location,
1139                                           uvm_fault_buffer_entry_t *current_entry,
1140                                           bool is_duplicate)
1141 {
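    // A duplicate of the previous ordered entry contributes all of its
    // coalesced instances to the duplicate count; otherwise only the instances
    // coalesced beyond the representative fault count as duplicates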
1142     if (is_duplicate)
1143         batch_context->num_duplicate_faults += current_entry->num_instances;
1144     else
1145         batch_context->num_duplicate_faults += current_entry->num_instances - 1;
1146 
1147     uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
1148                                     va_block,
1149                                     gpu->id,
1150                                     preferred_location,
1151                                     current_entry,
1152                                     batch_context->batch_id,
1153                                     is_duplicate);
1154 }
1155 
1156 static void mark_fault_invalid_prefetch(uvm_fault_service_batch_context_t *batch_context,
1157                                         uvm_fault_buffer_entry_t *fault_entry)
1158 {
1159     fault_entry->is_invalid_prefetch = true;
1160 
1161     // For block faults, the following counter might be updated more than once
1162     // for the same fault if block_context->num_retries > 0. As a result, this
    // counter might be higher than the actual count. In order for this counter
    // to always be accurate, block_context would need to be passed down the
    // stack from all callers. But since the num_retries > 0 case is uncommon
    // and an imprecise invalid_prefetch counter doesn't affect functionality
    // (other than disabling prefetching if the counter indicates lots of
    // invalid prefetch faults), this is ok.
1169     batch_context->num_invalid_prefetch_faults += fault_entry->num_instances;
1170 }
1171 
1172 static void mark_fault_throttled(uvm_fault_service_batch_context_t *batch_context,
1173                                  uvm_fault_buffer_entry_t *fault_entry)
1174 {
1175     fault_entry->is_throttled = true;
1176     batch_context->has_throttled_faults = true;
1177 }
1178 
1179 static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
1180                              uvm_fault_buffer_entry_t *fault_entry,
1181                              UvmEventFatalReason fatal_reason,
1182                              uvm_fault_cancel_va_mode_t cancel_va_mode)
1183 {
1184     uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[fault_entry->fault_source.utlb_id];
1185 
1186     fault_entry->is_fatal = true;
1187     fault_entry->fatal_reason = fatal_reason;
1188     fault_entry->replayable.cancel_va_mode = cancel_va_mode;
1189 
1190     utlb->has_fatal_faults = true;
1191 
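    // Remember the first VA space in the batch that contains a fatal fault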
1192     if (!batch_context->fatal_va_space) {
1193         UVM_ASSERT(fault_entry->va_space);
1194         batch_context->fatal_va_space = fault_entry->va_space;
1195     }
1196 }
1197 
1198 static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
1199                                         uvm_fault_buffer_entry_t *current_entry,
1200                                         const uvm_fault_buffer_entry_t *previous_entry)
1201 {
1202     UVM_ASSERT(previous_entry);
1203     UVM_ASSERT(check_fault_entry_duplicate(current_entry, previous_entry));
1204 
1205     // Propagate the is_invalid_prefetch flag across all prefetch faults
1206     // on the page
1207     if (previous_entry->is_invalid_prefetch)
1208         mark_fault_invalid_prefetch(batch_context, current_entry);
1209 
1210     // If a page is throttled, all faults on the page must be skipped
1211     if (previous_entry->is_throttled)
1212         mark_fault_throttled(batch_context, current_entry);
1213 }
1214 
// This function computes the maximum access type that can be serviced for the
// reported fault instances given the logical permissions of the VA range. If
// none of the fault instances can be serviced, UVM_FAULT_ACCESS_TYPE_COUNT is
// returned instead.
//
// If some of the fault instances cannot be serviced, this function also sets
// the flags required for fault cancellation. Prefetch faults do not need to be
// cancelled since they disappear on replay.
//
// The UVM driver considers two scenarios for logical permission violations:
// - All access types are invalid. For example, when faulting from a processor
// that doesn't have access to the preferred location of a range group while it
// is not migratable. In this case all accesses to the page must be cancelled.
// - Write/atomic accesses are invalid. Basically, when trying to modify a
// read-only VA range. In this case we restrict fault cancellation to those
// types of accesses.
//
// Return value:
// - service_access_type: highest access type that can be serviced.
1234 static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
1235                                                               uvm_fault_service_batch_context_t *batch_context,
1236                                                               uvm_va_block_t *va_block,
1237                                                               uvm_service_block_context_t *service_block_context,
1238                                                               uvm_fault_buffer_entry_t *fault_entry,
1239                                                               bool allow_migration)
1240 {
1241     NV_STATUS perm_status;
1242     UvmEventFatalReason fatal_reason;
1243     uvm_fault_cancel_va_mode_t cancel_va_mode;
1244     uvm_fault_access_type_t ret = UVM_FAULT_ACCESS_TYPE_COUNT;
1245     uvm_va_block_context_t *va_block_context = service_block_context->block_context;
1246 
1247     perm_status = uvm_va_block_check_logical_permissions(va_block,
1248                                                          va_block_context,
1249                                                          gpu->id,
1250                                                          uvm_va_block_cpu_page_index(va_block,
1251                                                                                      fault_entry->fault_address),
1252                                                          fault_entry->fault_access_type,
1253                                                          allow_migration);
1254     if (perm_status == NV_OK)
1255         return fault_entry->fault_access_type;
1256 
1257     if (fault_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
1258         // Only update the count the first time since logical permissions cannot
1259         // change while we hold the VA space lock
1260         // TODO: Bug 1750144: That might not be true with HMM.
1261         if (service_block_context->num_retries == 0)
1262             mark_fault_invalid_prefetch(batch_context, fault_entry);
1263 
1264         return ret;
1265     }
1266 
1267     // At this point we know that some fault instances cannot be serviced
1268     fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
1269 
1270     if (fault_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) {
1271         cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
1272 
1273         // If there are pending read accesses on the same page, we have to
1274         // service them before we can cancel the write/atomic faults. So we
1275         // retry with read fault access type.
1276         if (uvm_fault_access_type_mask_test(fault_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
1277             perm_status = uvm_va_block_check_logical_permissions(va_block,
1278                                                                  va_block_context,
1279                                                                  gpu->id,
1280                                                                  uvm_va_block_cpu_page_index(va_block,
1281                                                                                              fault_entry->fault_address),
1282                                                                  UVM_FAULT_ACCESS_TYPE_READ,
1283                                                                  allow_migration);
1284             if (perm_status == NV_OK) {
1285                 ret = UVM_FAULT_ACCESS_TYPE_READ;
1286             }
1287             else {
1288                 // Read accesses didn't succeed, cancel all faults
1289                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1290                 fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
1291             }
1292         }
1293     }
1294     else {
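        // A read access that violates the logical permissions means no access
        // type can be serviced, so cancel all accesses to the page.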
1295         cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1296     }
1297 
1298     mark_fault_fatal(batch_context, fault_entry, fatal_reason, cancel_va_mode);
1299 
1300     return ret;
1301 }
1302 
1303 // We notify the fault event for all faults within the block so that the
1304 // performance heuristics are updated. Then, all required actions for the block
1305 // data are performed by the performance heuristics code.
1306 //
1307 // Fatal faults are flagged as fatal for later cancellation. Servicing is not
1308 // interrupted on fatal faults due to insufficient permissions or invalid
1309 // addresses.
1310 //
1311 // Return codes:
1312 // - NV_OK if all faults were handled (both fatal and non-fatal)
1313 // - NV_ERR_MORE_PROCESSING_REQUIRED if servicing needs allocation retry
1314 // - NV_ERR_NO_MEMORY if the faults could not be serviced due to OOM
1315 // - Any other value is a UVM-global error
1316 static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
1317                                                   uvm_va_block_t *va_block,
1318                                                   uvm_va_block_retry_t *va_block_retry,
1319                                                   uvm_fault_service_batch_context_t *batch_context,
1320                                                   NvU32 first_fault_index,
1321                                                   const bool hmm_migratable,
1322                                                   NvU32 *block_faults)
1323 {
1324     NV_STATUS status = NV_OK;
1325     NvU32 i;
1326     uvm_page_index_t first_page_index;
1327     uvm_page_index_t last_page_index;
1328     NvU32 page_fault_count = 0;
1329     uvm_range_group_range_iter_t iter;
1330     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
1331     uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache;
1332     uvm_service_block_context_t *block_context = &replayable_faults->block_service_context;
1333     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1334     const uvm_va_policy_t *policy;
1335     NvU64 end;
1336 
1337     // Check that all uvm_fault_access_type_t values can fit into an NvU8
1338     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_COUNT > (int)(NvU8)-1);
1339 
1340     uvm_assert_mutex_locked(&va_block->lock);
1341 
1342     *block_faults = 0;
1343 
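    // Track the first and last faulting page indices so the service region
    // computed at the end of this function spans only the range of pages that
    // actually faulted.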
1344     first_page_index = PAGES_PER_UVM_VA_BLOCK;
1345     last_page_index = 0;
1346 
1347     // Initialize fault service block context
1348     uvm_processor_mask_zero(&block_context->resident_processors);
1349     block_context->thrashing_pin_count = 0;
1350     block_context->read_duplicate_count = 0;
1351 
1352     uvm_range_group_range_migratability_iter_first(va_space, va_block->start, va_block->end, &iter);
1353 
1354     // The first entry is guaranteed to fall within this block
1355     UVM_ASSERT(ordered_fault_cache[first_fault_index]->va_space == va_space);
1356     UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address >= va_block->start);
1357     UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address <= va_block->end);
1358 
1359     if (uvm_va_block_is_hmm(va_block)) {
1360         policy = uvm_hmm_find_policy_end(va_block,
1361                                          block_context->block_context->hmm.vma,
1362                                          ordered_fault_cache[first_fault_index]->fault_address,
1363                                          &end);
1364     }
1365     else {
1366         policy = uvm_va_range_get_policy(va_block->va_range);
1367         end = va_block->end;
1368     }
1369 
1370     // Scan the sorted array and notify the fault event for all fault entries
1371     // in the block
1372     for (i = first_fault_index;
1373          i < batch_context->num_coalesced_faults &&
1374          ordered_fault_cache[i]->va_space == va_space &&
1375          ordered_fault_cache[i]->fault_address <= end;
1376          ++i) {
1377         uvm_fault_buffer_entry_t *current_entry = ordered_fault_cache[i];
1378         const uvm_fault_buffer_entry_t *previous_entry = NULL;
1379         bool read_duplicate;
1380         uvm_processor_id_t new_residency;
1381         uvm_perf_thrashing_hint_t thrashing_hint;
1382         uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, current_entry->fault_address);
1383         bool is_duplicate = false;
1384         uvm_fault_access_type_t service_access_type;
1385         NvU32 service_access_type_mask;
1386 
1387         UVM_ASSERT(current_entry->fault_access_type ==
1388                    uvm_fault_access_type_mask_highest(current_entry->access_type_mask));
1389 
1390         // Unserviceable faults were already skipped by the caller. There are no
1391         // unserviceable fault types that could be in the same VA block as a
1392         // serviceable fault.
1393         UVM_ASSERT(!current_entry->is_fatal);
1394         current_entry->is_throttled        = false;
1395         current_entry->is_invalid_prefetch = false;
1396 
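        // The ordered cache is sorted, so duplicate faults on the same page
        // are adjacent; comparing against the previous entry is enough to
        // detect them.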
1397         if (i > first_fault_index) {
1398             previous_entry = ordered_fault_cache[i - 1];
1399             is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1400         }
1401 
1402         // Only update counters the first time since logical permissions cannot
1403         // change while we hold the VA space lock.
1404         // TODO: Bug 1750144: That might not be true with HMM.
1405         if (block_context->num_retries == 0) {
1406             update_batch_and_notify_fault(gpu,
1407                                           batch_context,
1408                                           va_block,
1409                                           policy->preferred_location,
1410                                           current_entry,
1411                                           is_duplicate);
1412         }
1413 
        // Service only the most intrusive fault per page; waive the rest
1415         if (is_duplicate) {
1416             fault_entry_duplicate_flags(batch_context, current_entry, previous_entry);
1417 
            // The previous fault was non-fatal, so the page has already been
            // serviced
1420             if (!previous_entry->is_fatal)
1421                 continue;
1422         }
1423 
1424         // Ensure that the migratability iterator covers the current fault
1425         // address
1426         while (iter.end < current_entry->fault_address)
1427             uvm_range_group_range_migratability_iter_next(va_space, &iter, va_block->end);
1428 
1429         UVM_ASSERT(iter.start <= current_entry->fault_address && iter.end >= current_entry->fault_address);
1430 
1431         service_access_type = check_fault_access_permissions(gpu,
1432                                                              batch_context,
1433                                                              va_block,
1434                                                              block_context,
1435                                                              current_entry,
1436                                                              iter.migratable);
1437 
1438         // Do not exit early due to logical errors such as access permission
1439         // violation.
1440         if (service_access_type == UVM_FAULT_ACCESS_TYPE_COUNT)
1441             continue;
1442 
1443         if (service_access_type != current_entry->fault_access_type) {
1444             // Some of the fault instances cannot be serviced due to invalid
1445             // access permissions. Recompute the access type service mask to
1446             // service the rest.
1447             UVM_ASSERT(service_access_type < current_entry->fault_access_type);
1448             service_access_type_mask = uvm_fault_access_type_mask_bit(service_access_type);
1449         }
1450         else {
1451             service_access_type_mask = current_entry->access_type_mask;
1452         }
1453 
1454         // If the GPU already has the necessary access permission, the fault
1455         // does not need to be serviced
1456         if (uvm_va_block_page_is_gpu_authorized(va_block,
1457                                                 page_index,
1458                                                 gpu->id,
1459                                                 uvm_fault_access_type_to_prot(service_access_type)))
1460             continue;
1461 
1462         thrashing_hint = uvm_perf_thrashing_get_hint(va_block,
1463                                                      block_context->block_context,
1464                                                      current_entry->fault_address,
1465                                                      gpu->id);
1466         if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
1467             // Throttling is implemented by sleeping in the fault handler on
1468             // the CPU and by continuing to process faults on other pages on
1469             // the GPU
1470             //
1471             // Only update the flag the first time since logical permissions
1472             // cannot change while we hold the VA space lock.
1473             // TODO: Bug 1750144: That might not be true with HMM.
1474             if (block_context->num_retries == 0)
1475                 mark_fault_throttled(batch_context, current_entry);
1476 
1477             continue;
1478         }
1479         else if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
1480             if (block_context->thrashing_pin_count++ == 0)
1481                 uvm_page_mask_zero(&block_context->thrashing_pin_mask);
1482 
1483             uvm_page_mask_set(&block_context->thrashing_pin_mask, page_index);
1484         }
1485 
1486         // Compute new residency and update the masks
1487         new_residency = uvm_va_block_select_residency(va_block,
1488                                                       block_context->block_context,
1489                                                       page_index,
1490                                                       gpu->id,
1491                                                       service_access_type_mask,
1492                                                       policy,
1493                                                       &thrashing_hint,
1494                                                       UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
1495                                                       hmm_migratable,
1496                                                       &read_duplicate);
1497 
1498         if (!uvm_processor_mask_test_and_set(&block_context->resident_processors, new_residency))
1499             uvm_page_mask_zero(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
1500 
1501         uvm_page_mask_set(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
1502 
1503         if (read_duplicate) {
1504             if (block_context->read_duplicate_count++ == 0)
1505                 uvm_page_mask_zero(&block_context->read_duplicate_mask);
1506 
1507             uvm_page_mask_set(&block_context->read_duplicate_mask, page_index);
1508         }
1509 
1510         ++page_fault_count;
1511 
1512         block_context->access_type[page_index] = service_access_type;
1513 
1514         if (page_index < first_page_index)
1515             first_page_index = page_index;
1516         if (page_index > last_page_index)
1517             last_page_index = page_index;
1518     }
1519 
1520     // Apply the changes computed in the fault service block context, if there
1521     // are pages to be serviced
1522     if (page_fault_count > 0) {
1523         block_context->region = uvm_va_block_region(first_page_index, last_page_index + 1);
1524         status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, block_context);
1525     }
1526 
1527     *block_faults = i - first_fault_index;
1528 
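    // num_retries counts the invocations of this function for the same batch;
    // the caller re-invokes it on allocation retry, which is why several
    // counters above are only updated when num_retries == 0.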
1529     ++block_context->num_retries;
1530 
1531     if (status == NV_OK && batch_context->fatal_va_space)
1532         status = uvm_va_block_set_cancel(va_block, block_context->block_context, gpu);
1533 
1534     return status;
1535 }
1536 
1537 // We notify the fault event for all faults within the block so that the
1538 // performance heuristics are updated. The VA block lock is taken for the whole
1539 // fault servicing although it might be temporarily dropped and re-taken if
1540 // memory eviction is required.
1541 //
1542 // See the comments for function service_fault_batch_block_locked for
1543 // implementation details and error codes.
1544 static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu,
1545                                            uvm_va_block_t *va_block,
1546                                            uvm_fault_service_batch_context_t *batch_context,
1547                                            NvU32 first_fault_index,
1548                                            const bool hmm_migratable,
1549                                            NvU32 *block_faults)
1550 {
1551     NV_STATUS status;
1552     uvm_va_block_retry_t va_block_retry;
1553     NV_STATUS tracker_status;
1554     uvm_service_block_context_t *fault_block_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
1555 
1556     fault_block_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
1557     fault_block_context->num_retries = 0;
1558 
1559     if (uvm_va_block_is_hmm(va_block))
1560         uvm_hmm_migrate_begin_wait(va_block);
1561 
1562     uvm_mutex_lock(&va_block->lock);
1563 
1564     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
1565                                        service_fault_batch_block_locked(gpu,
1566                                                                         va_block,
1567                                                                         &va_block_retry,
1568                                                                         batch_context,
1569                                                                         first_fault_index,
1570                                                                         hmm_migratable,
1571                                                                         block_faults));
1572 
1573     tracker_status = uvm_tracker_add_tracker_safe(&batch_context->tracker, &va_block->tracker);
1574 
1575     uvm_mutex_unlock(&va_block->lock);
1576 
1577     if (uvm_va_block_is_hmm(va_block))
1578         uvm_hmm_migrate_finish(va_block);
1579 
1580     return status == NV_OK? tracker_status: status;
1581 }
1582 
1583 typedef enum
1584 {
1585     // Use this mode when calling from the normal fault servicing path
1586     FAULT_SERVICE_MODE_REGULAR,
1587 
1588     // Use this mode when servicing faults from the fault cancelling algorithm.
1589     // In this mode no replays are issued
1590     FAULT_SERVICE_MODE_CANCEL,
1591 } fault_service_mode_t;
1592 
1593 static void service_fault_batch_fatal(uvm_gpu_t *gpu,
1594                                       uvm_fault_service_batch_context_t *batch_context,
1595                                       NvU32 first_fault_index,
1596                                       NV_STATUS status,
1597                                       uvm_fault_cancel_va_mode_t cancel_va_mode,
1598                                       NvU32 *block_faults)
1599 {
1600     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1601     const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
1602                                                        batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
1603     bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1604 
1605     if (is_duplicate)
1606         fault_entry_duplicate_flags(batch_context, current_entry, previous_entry);
1607 
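    // Prefetch faults are not cancelled since they disappear on replay; they
    // are just flagged as invalid prefetches. Everything else is marked fatal
    // for later cancellation.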
1608     if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1609         mark_fault_invalid_prefetch(batch_context, current_entry);
1610     else
1611         mark_fault_fatal(batch_context, current_entry, uvm_tools_status_to_fatal_fault_reason(status), cancel_va_mode);
1612 
1613     (*block_faults)++;
1614 }
1615 
1616 static void service_fault_batch_fatal_notify(uvm_gpu_t *gpu,
1617                                              uvm_fault_service_batch_context_t *batch_context,
1618                                              NvU32 first_fault_index,
1619                                              NV_STATUS status,
1620                                              uvm_fault_cancel_va_mode_t cancel_va_mode,
1621                                              NvU32 *block_faults)
1622 {
1623     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1624     const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
1625                                                        batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
1626     bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1627 
1628     service_fault_batch_fatal(gpu, batch_context, first_fault_index, status, cancel_va_mode, block_faults);
1629 
1630     update_batch_and_notify_fault(gpu, batch_context, NULL, UVM_ID_INVALID, current_entry, is_duplicate);
1631 }
1632 
1633 static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_space,
1634                                                  struct vm_area_struct *vma,
1635                                                  NvU64 base,
1636                                                  uvm_fault_service_batch_context_t *batch_context,
1637                                                  NvU32 fault_index_start,
1638                                                  NvU32 fault_index_end,
1639                                                  NvU32 *block_faults)
1640 {
1641     NvU32 i;
1642     NV_STATUS status = NV_OK;
1643     uvm_gpu_t *gpu = gpu_va_space->gpu;
1644     uvm_ats_fault_context_t *ats_context = &batch_context->ats_context;
1645     const uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
1646     const uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
1647     const uvm_page_mask_t *reads_serviced_mask = &ats_context->reads_serviced_mask;
1648     uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
1649     uvm_page_mask_t *accessed_mask = &ats_context->accessed_mask;
1650 
1651     UVM_ASSERT(vma);
1652 
1653     ats_context->client_type = UVM_FAULT_CLIENT_TYPE_GPC;
1654 
1655     uvm_page_mask_or(accessed_mask, write_fault_mask, read_fault_mask);
1656 
1657     status = uvm_ats_service_faults(gpu_va_space, vma, base, &batch_context->ats_context);
1658 
    // Remove prefetched pages from the serviced mask, since fault servicing
    // failures on prefetched pages need to be ignored.
1661     uvm_page_mask_and(faults_serviced_mask, faults_serviced_mask, accessed_mask);
1662 
1663     UVM_ASSERT(uvm_page_mask_subset(faults_serviced_mask, accessed_mask));
1664 
1665     if ((status != NV_OK) || uvm_page_mask_equal(faults_serviced_mask, accessed_mask)) {
1666         (*block_faults) += (fault_index_end - fault_index_start);
1667         return status;
1668     }
1669 
    // Check faults_serviced_mask and reads_serviced_mask for precise fault
    // attribution after calling the ATS servicing routine. The errors returned
    // from the ATS servicing routine should only be global errors such as OOM
    // or ECC. uvm_gpu_service_replayable_faults() handles global errors by
    // calling cancel_fault_batch(). Precise attribution isn't currently
    // supported in such cases.
1676     //
1677     // Precise fault attribution for global errors can be handled by
1678     // servicing one fault at a time until fault servicing encounters an
1679     // error.
1680     // TODO: Bug 3989244: Precise ATS fault attribution for global errors.
1681     for (i = fault_index_start; i < fault_index_end; i++) {
1682         uvm_page_index_t page_index;
1683         uvm_fault_cancel_va_mode_t cancel_va_mode;
1684         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1685         uvm_fault_access_type_t access_type = current_entry->fault_access_type;
1686 
1687         page_index = (current_entry->fault_address - base) / PAGE_SIZE;
1688 
1689         if (uvm_page_mask_test(faults_serviced_mask, page_index)) {
1690             (*block_faults)++;
1691             continue;
1692         }
1693 
1694         if (access_type <= UVM_FAULT_ACCESS_TYPE_READ) {
1695             cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1696         }
        else {
1698             UVM_ASSERT(access_type >= UVM_FAULT_ACCESS_TYPE_WRITE);
1699             if (uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ) &&
1700                 !uvm_page_mask_test(reads_serviced_mask, page_index))
1701                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1702             else
1703                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
1704         }
1705 
1706         service_fault_batch_fatal(gpu, batch_context, i, NV_ERR_INVALID_ADDRESS, cancel_va_mode, block_faults);
1707     }
1708 
1709     return status;
1710 }
1711 
1712 static void start_new_sub_batch(NvU64 *sub_batch_base,
1713                                 NvU64 address,
1714                                 NvU32 *sub_batch_fault_index,
1715                                 NvU32 fault_index,
1716                                 uvm_ats_fault_context_t *ats_context)
1717 {
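    // A sub-batch covers a single UVM_VA_BLOCK_SIZE-aligned region of the vma;
    // the read/write fault masks are indexed by page relative to
    // sub_batch_base.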
1718     uvm_page_mask_zero(&ats_context->read_fault_mask);
1719     uvm_page_mask_zero(&ats_context->write_fault_mask);
1720 
1721     *sub_batch_fault_index = fault_index;
1722     *sub_batch_base = UVM_VA_BLOCK_ALIGN_DOWN(address);
1723 }
1724 
1725 static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
1726                                              struct vm_area_struct *vma,
1727                                              uvm_fault_service_batch_context_t *batch_context,
1728                                              NvU32 fault_index,
1729                                              NvU64 outer,
1730                                              NvU32 *block_faults)
1731 {
1732     NV_STATUS status = NV_OK;
1733     NvU32 i = fault_index;
1734     NvU32 sub_batch_fault_index;
1735     NvU64 sub_batch_base;
1736     uvm_fault_buffer_entry_t *previous_entry = NULL;
1737     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1738     uvm_ats_fault_context_t *ats_context = &batch_context->ats_context;
1739     uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
1740     uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
1741     uvm_gpu_t *gpu = gpu_va_space->gpu;
1742     bool replay_per_va_block =
1743                         (gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK);
1744 
1745     UVM_ASSERT(vma);
1746 
1747     outer = min(outer, (NvU64) vma->vm_end);
1748 
1749     start_new_sub_batch(&sub_batch_base, current_entry->fault_address, &sub_batch_fault_index, i, ats_context);
1750 
1751     do {
1752         uvm_page_index_t page_index;
1753         NvU64 fault_address = current_entry->fault_address;
1754         uvm_fault_access_type_t access_type = current_entry->fault_access_type;
1755         bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1756 
1757         // ATS faults can't be unserviceable, since unserviceable faults require
1758         // GMMU PTEs.
1759         UVM_ASSERT(!current_entry->is_fatal);
1760 
1761         i++;
1762 
1763         update_batch_and_notify_fault(gpu_va_space->gpu,
1764                                       batch_context,
1765                                       NULL,
1766                                       UVM_ID_INVALID,
1767                                       current_entry,
1768                                       is_duplicate);
1769 
1770         // End of sub-batch. Service faults gathered so far.
1771         if (fault_address >= (sub_batch_base + UVM_VA_BLOCK_SIZE)) {
1772             UVM_ASSERT(!uvm_page_mask_empty(read_fault_mask) || !uvm_page_mask_empty(write_fault_mask));
1773 
1774             status = service_fault_batch_ats_sub_vma(gpu_va_space,
1775                                                      vma,
1776                                                      sub_batch_base,
1777                                                      batch_context,
1778                                                      sub_batch_fault_index,
1779                                                      i - 1,
1780                                                      block_faults);
1781             if (status != NV_OK || replay_per_va_block)
1782                 break;
1783 
1784             start_new_sub_batch(&sub_batch_base, fault_address, &sub_batch_fault_index, i - 1, ats_context);
1785         }
1786 
1787         page_index = (fault_address - sub_batch_base) / PAGE_SIZE;
1788 
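        // Accumulate the fault into the ATS masks: read accesses, and
        // write/atomic faults that also carry a pending read, set the read
        // mask; write/atomic accesses set the write mask.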
1789         if ((access_type <= UVM_FAULT_ACCESS_TYPE_READ) ||
1790              uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ))
1791             uvm_page_mask_set(read_fault_mask, page_index);
1792 
1793         if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE)
1794             uvm_page_mask_set(write_fault_mask, page_index);
1795 
1796         previous_entry = current_entry;
1797         current_entry = i < batch_context->num_coalesced_faults ? batch_context->ordered_fault_cache[i] : NULL;
1798 
1799     } while (current_entry &&
1800              (current_entry->fault_address < outer) &&
1801              (previous_entry->va_space == current_entry->va_space));
1802 
1803     // Service the last sub-batch.
1804     if ((status == NV_OK) && (!uvm_page_mask_empty(read_fault_mask) || !uvm_page_mask_empty(write_fault_mask))) {
1805         status = service_fault_batch_ats_sub_vma(gpu_va_space,
1806                                                  vma,
1807                                                  sub_batch_base,
1808                                                  batch_context,
1809                                                  sub_batch_fault_index,
1810                                                  i,
1811                                                  block_faults);
1812     }
1813 
1814     return status;
1815 }
1816 
1817 static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
1818                                          struct mm_struct *mm,
1819                                          uvm_fault_service_batch_context_t *batch_context,
1820                                          NvU32 first_fault_index,
1821                                          NvU64 outer,
1822                                          NvU32 *block_faults)
1823 {
1824     NvU32 i;
1825     NV_STATUS status = NV_OK;
1826 
1827     for (i = first_fault_index; i < batch_context->num_coalesced_faults;) {
1828         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1829         const uvm_fault_buffer_entry_t *previous_entry = i > first_fault_index ?
1830                                                                        batch_context->ordered_fault_cache[i - 1] : NULL;
1831         NvU64 fault_address = current_entry->fault_address;
1832         struct vm_area_struct *vma;
1833         NvU32 num_faults_before = (*block_faults);
1834 
1835         if (previous_entry && (previous_entry->va_space != current_entry->va_space))
1836             break;
1837 
1838         if (fault_address >= outer)
1839             break;
1840 
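        // Look up the vma covering the faulting address. If there is none, the
        // fault cannot be serviced and is cancelled below.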
1841         vma = find_vma_intersection(mm, fault_address, fault_address + 1);
1842         if (!vma) {
            // No vma was found, so cancel all accesses on the page. Cancelling
            // only write and atomic accesses would leave pending read faults
            // unhandled, which can lead to a deadlock because read faults need
            // to be serviced before write faults can be cancelled.
1847             service_fault_batch_fatal_notify(gpu_va_space->gpu,
1848                                              batch_context,
1849                                              i,
1850                                              NV_ERR_INVALID_ADDRESS,
1851                                              UVM_FAULT_CANCEL_VA_MODE_ALL,
1852                                              block_faults);
1853 
1854             // Do not fail due to logical errors.
1855             status = NV_OK;
1856 
1857             break;
1858         }
1859 
1860         status = service_fault_batch_ats_sub(gpu_va_space, vma, batch_context, i, outer, block_faults);
1861         if (status != NV_OK)
1862             break;
1863 
1864         i += ((*block_faults) - num_faults_before);
1865     }
1866 
1867     return status;
1868 }
1869 
1870 static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
1871                                               uvm_gpu_va_space_t *gpu_va_space,
1872                                               uvm_fault_service_batch_context_t *batch_context,
1873                                               NvU32 fault_index,
1874                                               NvU32 *block_faults,
1875                                               bool replay_per_va_block,
1876                                               const bool hmm_migratable)
1877 {
1878     NV_STATUS status;
1879     uvm_va_range_t *va_range = NULL;
1880     uvm_va_range_t *va_range_next = NULL;
1881     uvm_va_block_t *va_block;
1882     uvm_gpu_t *gpu = gpu_va_space->gpu;
1883     uvm_va_block_context_t *va_block_context =
1884         gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
1885     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[fault_index];
1886     struct mm_struct *mm = va_block_context->mm;
1887     NvU64 fault_address = current_entry->fault_address;
1888 
1889     (*block_faults) = 0;
1890 
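    // Find the managed VA range containing the fault address, if any.
    // va_range_next is kept around to bound ATS servicing further below.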
1891     va_range_next = uvm_va_space_iter_first(va_space, fault_address, ~0ULL);
1892     if (va_range_next && (fault_address >= va_range_next->node.start)) {
1893         UVM_ASSERT(fault_address < va_range_next->node.end);
1894 
1895         va_range = va_range_next;
1896         va_range_next = uvm_va_space_iter_next(va_range_next, ~0ULL);
1897     }
1898 
1899     if (va_range)
1900         status = uvm_va_block_find_create_in_range(va_space, va_range, fault_address, &va_block);
1901     else if (mm)
1902         status = uvm_hmm_va_block_find_create(va_space, fault_address, &va_block_context->hmm.vma, &va_block);
1903     else
1904         status = NV_ERR_INVALID_ADDRESS;
1905 
1906     if (status == NV_OK) {
1907         status = service_fault_batch_block(gpu, va_block, batch_context, fault_index, hmm_migratable, block_faults);
1908     }
1909     else if ((status == NV_ERR_INVALID_ADDRESS) && uvm_ats_can_service_faults(gpu_va_space, mm)) {
1910         NvU64 outer = ~0ULL;
1911 
        UVM_ASSERT(replay_per_va_block ==
                   (gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK));
1914 
        // Limit outer to the minimum of the next va_range's start and the end
        // of the UVM_GMMU_ATS_GRANULARITY-aligned region containing
        // fault_address, so that checking whether the first fault in this
        // dispatch belongs to a GMMU region is sufficient.
1919         if (va_range_next) {
            outer = min(va_range_next->node.start,
                        UVM_ALIGN_DOWN(fault_address + UVM_GMMU_ATS_GRANULARITY, UVM_GMMU_ATS_GRANULARITY));
1922         }
1923 
1924         // ATS lookups are disabled on all addresses within the same
1925         // UVM_GMMU_ATS_GRANULARITY as existing GMMU mappings (see documentation
1926         // in uvm_mmu.h). User mode is supposed to reserve VAs as appropriate to
1927         // prevent any system memory allocations from falling within the NO_ATS
1928         // range of other GMMU mappings, so this shouldn't happen during normal
1929         // operation. However, since this scenario may lead to infinite fault
        // loops, we handle it by cancelling the fault.
1931         if (uvm_ats_check_in_gmmu_region(va_space, fault_address, va_range_next)) {
1932             service_fault_batch_fatal_notify(gpu,
1933                                              batch_context,
1934                                              fault_index,
1935                                              NV_ERR_INVALID_ADDRESS,
1936                                              UVM_FAULT_CANCEL_VA_MODE_ALL,
1937                                              block_faults);
1938 
1939             // Do not fail due to logical errors
1940             status = NV_OK;
1941         }
1942         else {
1943             status = service_fault_batch_ats(gpu_va_space, mm, batch_context, fault_index, outer, block_faults);
1944         }
1945     }
1946     else {
1947         service_fault_batch_fatal_notify(gpu,
1948                                          batch_context,
1949                                          fault_index,
1950                                          status,
1951                                          UVM_FAULT_CANCEL_VA_MODE_ALL,
1952                                          block_faults);
1953 
1954         // Do not fail due to logical errors
1955         status = NV_OK;
1956     }
1957 
1958     return status;
1959 }
1960 
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA space and mmap locks to remove any potential stale fatal
// faults, then service all new faults for just that VA space and cancel those
// which are fatal. Faults in other VA spaces are replayed when done and will
// be processed when normal fault servicing resumes.
1966 static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
1967 {
1968     NV_STATUS status = NV_OK;
1969     NvU32 i;
1970     uvm_va_space_t *va_space = batch_context->fatal_va_space;
1971     uvm_gpu_va_space_t *gpu_va_space = NULL;
1972     struct mm_struct *mm;
1973     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
1974     uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
1975     uvm_va_block_context_t *va_block_context = service_context->block_context;
1976 
1977     UVM_ASSERT(gpu->parent->replayable_faults_supported);
1978     UVM_ASSERT(va_space);
1979 
1980     // Perform the flush and re-fetch while holding the mmap_lock and the
1981     // VA space lock. This avoids stale faults because it prevents any vma
1982     // modifications (mmap, munmap, mprotect) from happening between the time HW
1983     // takes the fault and we cancel it.
1984     mm = uvm_va_space_mm_retain_lock(va_space);
1985     uvm_va_block_context_init(va_block_context, mm);
1986     uvm_va_space_down_read(va_space);
1987 
1988     // We saw fatal faults in this VA space before. Flush while holding
1989     // mmap_lock to make sure those faults come back (aren't stale).
1990     //
1991     // We need to wait until all old fault messages have arrived before
1992     // flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
1993     status = fault_buffer_flush_locked(gpu,
1994                                        UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
1995                                        UVM_FAULT_REPLAY_TYPE_START,
1996                                        batch_context);
1997     if (status != NV_OK)
1998         goto done;
1999 
2000     // Wait for the flush's replay to finish to give the legitimate faults a
2001     // chance to show up in the buffer again.
2002     status = uvm_tracker_wait(&replayable_faults->replay_tracker);
2003     if (status != NV_OK)
2004         goto done;
2005 
2006     // We expect all replayed faults to have arrived in the buffer so we can re-
2007     // service them. The replay-and-wait sequence above will ensure they're all
2008     // in the HW buffer. When GSP owns the HW buffer, we also have to wait for
2009     // GSP to copy all available faults from the HW buffer into the shadow
2010     // buffer.
2011     status = hw_fault_buffer_flush_locked(gpu->parent, HW_FAULT_BUFFER_FLUSH_MODE_MOVE);
2012     if (status != NV_OK)
2013         goto done;
2014 
2015     // If there is no GPU VA space for the GPU, ignore all faults in the VA
2016     // space. This can happen if the GPU VA space has been destroyed since we
2017     // unlocked the VA space in service_fault_batch. That means the fatal faults
2018     // are stale, because unregistering the GPU VA space requires preempting the
2019     // context and detaching all channels in that VA space. Restart fault
2020     // servicing from the top.
2021     gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
2022     if (!gpu_va_space)
2023         goto done;
2024 
2025     // Re-parse the new faults
2026     batch_context->num_invalid_prefetch_faults = 0;
2027     batch_context->num_duplicate_faults        = 0;
2028     batch_context->num_replays                 = 0;
2029     batch_context->fatal_va_space              = NULL;
2030     batch_context->has_throttled_faults        = false;
2031 
2032     status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
2033     if (status != NV_OK)
2034         goto done;
2035 
2036     // No more faults left. Either the previously-seen fatal entry was stale, or
2037     // RM killed the context underneath us.
2038     if (batch_context->num_cached_faults == 0)
2039         goto done;
2040 
2041     ++batch_context->batch_id;
2042 
2043     status = preprocess_fault_batch(gpu, batch_context);
2044     if (status != NV_OK) {
2045         if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2046             // Another flush happened due to stale faults or a context-fatal
2047             // error. The previously-seen fatal fault might not exist anymore,
2048             // so restart fault servicing from the top.
2049             status = NV_OK;
2050         }
2051 
2052         goto done;
2053     }
2054 
2055     // Search for the target VA space
2056     for (i = 0; i < batch_context->num_coalesced_faults; i++) {
2057         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
2058         UVM_ASSERT(current_entry->va_space);
2059         if (current_entry->va_space == va_space)
2060             break;
2061     }
2062 
2063     while (i < batch_context->num_coalesced_faults) {
2064         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
2065 
2066         if (current_entry->va_space != va_space)
2067             break;
2068 
2069         // service_fault_batch_dispatch() doesn't expect unserviceable faults.
2070         // Just cancel them directly.
2071         if (current_entry->is_fatal) {
2072             status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
2073             if (status != NV_OK)
2074                 break;
2075 
2076             ++i;
2077         }
2078         else {
2079             uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
2080             NvU32 block_faults;
2081             const bool hmm_migratable = true;
2082 
2083             ats_invalidate->tlb_batch_pending = false;
2084 
2085             // Service all the faults that we can. We only really need to search
2086             // for fatal faults, but attempting to service all is the easiest
2087             // way to do that.
            status = service_fault_batch_dispatch(va_space,
                                                  gpu_va_space,
                                                  batch_context,
                                                  i,
                                                  &block_faults,
                                                  false,
                                                  hmm_migratable);
2089             if (status != NV_OK) {
2090                 // TODO: Bug 3900733: clean up locking in service_fault_batch().
2091                 // We need to drop lock and retry. That means flushing and
2092                 // starting over.
2093                 if (status == NV_WARN_MORE_PROCESSING_REQUIRED || status == NV_WARN_MISMATCHED_TARGET)
2094                     status = NV_OK;
2095 
2096                 break;
2097             }
2098 
2099             // Invalidate TLBs before cancel to ensure that fatal faults don't
2100             // get stuck in HW behind non-fatal faults to the same line.
2101             status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
2102             if (status != NV_OK)
2103                 break;
2104 
2105             while (block_faults-- > 0) {
2106                 current_entry = batch_context->ordered_fault_cache[i];
2107                 if (current_entry->is_fatal) {
2108                     status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
2109                     if (status != NV_OK)
2110                         break;
2111                 }
2112 
2113                 ++i;
2114             }
2115         }
2116     }
2117 
2118 done:
2119     uvm_va_space_up_read(va_space);
2120     uvm_va_space_mm_release_unlock(va_space, mm);
2121 
2122     if (status == NV_OK) {
2123         // There are two reasons to flush the fault buffer here.
2124         //
2125         // 1) Functional. We need to replay both the serviced non-fatal faults
2126         //    and the skipped faults in other VA spaces. The former need to be
2127         //    restarted and the latter need to be replayed so the normal fault
2128         //    service mechanism can fetch and process them.
2129         //
2130         // 2) Performance. After cancelling the fatal faults, a flush removes
2131         //    any potential duplicated fault that may have been added while
2132         //    processing the faults in this batch. This flush also avoids doing
2133         //    unnecessary processing after the fatal faults have been cancelled,
2134         //    so all the rest are unlikely to remain after a replay because the
2135         //    context is probably in the process of dying.
2136         status = fault_buffer_flush_locked(gpu,
2137                                            UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
2138                                            UVM_FAULT_REPLAY_TYPE_START,
2139                                            batch_context);
2140     }
2141 
2142     return status;
2143 }

// Scan the ordered view of faults and group them by different va_blocks
2145 // (managed faults) and service faults for each va_block, in batch.
2146 // Service non-managed faults one at a time as they are encountered during the
2147 // scan.
2148 //
2149 // Fatal faults are marked for later processing by the caller.
2150 static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
2151                                      fault_service_mode_t service_mode,
2152                                      uvm_fault_service_batch_context_t *batch_context)
2153 {
2154     NV_STATUS status = NV_OK;
2155     NvU32 i;
2156     uvm_va_space_t *va_space = NULL;
2157     uvm_gpu_va_space_t *gpu_va_space = NULL;
2158     uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
2159     struct mm_struct *mm = NULL;
2160     const bool replay_per_va_block = service_mode != FAULT_SERVICE_MODE_CANCEL &&
2161                                      gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
2162     uvm_service_block_context_t *service_context =
2163         &gpu->parent->fault_buffer_info.replayable.block_service_context;
2164     uvm_va_block_context_t *va_block_context = service_context->block_context;
2165     bool hmm_migratable = true;
2166 
2167     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2168 
2169     ats_invalidate->tlb_batch_pending = false;
2170 
2171     for (i = 0; i < batch_context->num_coalesced_faults;) {
2172         NvU32 block_faults;
2173         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
2174         uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
2175 
2176         UVM_ASSERT(current_entry->va_space);
2177 
2178         if (current_entry->va_space != va_space) {
2179             // Fault on a different va_space, drop the lock of the old one...
2180             if (va_space != NULL) {
2181                 // TLB entries are invalidated per GPU VA space
2182                 status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
2183                 if (status != NV_OK)
2184                     goto fail;
2185 
2186                 uvm_va_space_up_read(va_space);
2187                 uvm_va_space_mm_release_unlock(va_space, mm);
2188                 mm = NULL;
2189             }
2190 
2191             va_space = current_entry->va_space;
2192 
2193             // ... and take the lock of the new one
2194 
2195             // If an mm is registered with the VA space, we have to retain it
2196             // in order to lock it before locking the VA space. It is guaranteed
2197             // to remain valid until we release. If no mm is registered, we
2198             // can only service managed faults, not ATS/HMM faults.
2199             mm = uvm_va_space_mm_retain_lock(va_space);
2200             uvm_va_block_context_init(va_block_context, mm);
2201 
2202             uvm_va_space_down_read(va_space);
2203             gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
2204         }
2205 
        // Some faults could already be fatal if they cannot be handled by
        // the UVM driver
2208         if (current_entry->is_fatal) {
2209             ++i;
2210             if (!batch_context->fatal_va_space)
2211                 batch_context->fatal_va_space = va_space;
2212 
2213             utlb->has_fatal_faults = true;
2214             UVM_ASSERT(utlb->num_pending_faults > 0);
2215             continue;
2216         }
2217 
2218         if (!gpu_va_space) {
2219             // If there is no GPU VA space for the GPU, ignore the fault. This
2220             // can happen if a GPU VA space is destroyed without explicitly
2221             // freeing all memory ranges and there are stale entries in the
2222             // buffer that got fixed by the servicing in a previous batch.
2223             ++i;
2224             continue;
2225         }
2226 
2227         status = service_fault_batch_dispatch(va_space,
2228                                               gpu_va_space,
2229                                               batch_context,
2230                                               i,
2231                                               &block_faults,
2232                                               replay_per_va_block,
2233                                               hmm_migratable);
2234         // TODO: Bug 3900733: clean up locking in service_fault_batch().
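        // Both warnings require dropping the VA space lock and retrying the
        // same fault. NV_WARN_MISMATCHED_TARGET additionally retries with
        // hmm_migratable set to false.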
2235         if (status == NV_WARN_MORE_PROCESSING_REQUIRED || status == NV_WARN_MISMATCHED_TARGET) {
2236             if (status == NV_WARN_MISMATCHED_TARGET)
2237                 hmm_migratable = false;
2238             uvm_va_space_up_read(va_space);
2239             uvm_va_space_mm_release_unlock(va_space, mm);
2240             mm = NULL;
2241             va_space = NULL;
2242             status = NV_OK;
2243             continue;
2244         }
2245 
2246         if (status != NV_OK)
2247             goto fail;
2248 
2249         hmm_migratable = true;
2250         i += block_faults;
2251 
2252         // Don't issue replays in cancel mode
2253         if (replay_per_va_block && !batch_context->fatal_va_space) {
2254             status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2255             if (status != NV_OK)
2256                 goto fail;
2257 
2258             // Increment the batch id if UVM_PERF_FAULT_REPLAY_POLICY_BLOCK
2259             // is used, as we issue a replay after servicing each VA block
2260             // and we can service a number of VA blocks before returning.
2261             ++batch_context->batch_id;
2262         }
2263     }
2264 
2265     if (va_space != NULL) {
2266         NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
2267         if (invalidate_status != NV_OK)
2268             status = invalidate_status;
2269     }
2270 
2271 fail:
2272     if (va_space != NULL) {
2273         uvm_va_space_up_read(va_space);
2274         uvm_va_space_mm_release_unlock(va_space, mm);
2275     }
2276 
2277     return status;
2278 }
2279 
2280 // Tells if the given fault entry is the first one in its uTLB
2281 static bool is_first_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context, NvU32 fault_index)
2282 {
2283     NvU32 i;
2284     NvU32 utlb_id = batch_context->fault_cache[fault_index].fault_source.utlb_id;
2285 
2286     for (i = 0; i < fault_index; ++i) {
2287         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
2288 
2289         // We have found a prior fault in the same uTLB
2290         if (current_entry->fault_source.utlb_id == utlb_id)
2291             return false;
2292     }
2293 
2294     return true;
2295 }
2296 
2297 // Compute the number of fatal and non-fatal faults for a page in the given uTLB
2298 static void faults_for_page_in_utlb(uvm_fault_service_batch_context_t *batch_context,
2299                                     uvm_va_space_t *va_space,
2300                                     NvU64 addr,
2301                                     NvU32 utlb_id,
2302                                     NvU32 *fatal_faults,
2303                                     NvU32 *non_fatal_faults)
2304 {
2305     NvU32 i;
2306 
2307     *fatal_faults = 0;
2308     *non_fatal_faults = 0;
2309 
2310     // Fault filtering is not allowed in the TLB-based fault cancel path
2311     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2312 
2313     for (i = 0; i < batch_context->num_cached_faults; ++i) {
2314         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
2315 
2316         if (current_entry->fault_source.utlb_id == utlb_id &&
2317             current_entry->va_space == va_space && current_entry->fault_address == addr) {
2318             // We have found the page
2319             if (current_entry->is_fatal)
2320                 ++(*fatal_faults);
2321             else
2322                 ++(*non_fatal_faults);
2323         }
2324     }
2325 }
2326 
// Tells if there are addresses in the given uTLB (remember: fault addresses
// are 4K-aligned) with non-fatal faults only
2329 static bool no_fatal_pages_in_utlb(uvm_fault_service_batch_context_t *batch_context,
2330                                    NvU32 start_index,
2331                                    NvU32 utlb_id)
2332 {
2333     NvU32 i;
2334 
2335     // Fault filtering is not allowed in the TLB-based fault cancel path
2336     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2337 
2338     for (i = start_index; i < batch_context->num_cached_faults; ++i) {
2339         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
2340 
2341         if (current_entry->fault_source.utlb_id == utlb_id) {
2342             // We have found a fault for the uTLB
2343             NvU32 fatal_faults;
2344             NvU32 non_fatal_faults;
2345 
2346             faults_for_page_in_utlb(batch_context,
2347                                     current_entry->va_space,
2348                                     current_entry->fault_address,
2349                                     utlb_id,
2350                                     &fatal_faults,
2351                                     &non_fatal_faults);
2352 
2353             if (non_fatal_faults > 0 && fatal_faults == 0)
2354                 return true;
2355         }
2356     }
2357 
2358     return false;
2359 }
2360 
2361 static void record_fatal_fault_helper(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *entry, UvmEventFatalReason reason)
2362 {
2363     uvm_va_space_t *va_space;
2364 
2365     va_space = entry->va_space;
2366     UVM_ASSERT(va_space);
2367     uvm_va_space_down_read(va_space);
2368     // Record fatal fault event
2369     uvm_tools_record_gpu_fatal_fault(gpu->id, va_space, entry, reason);
2370     uvm_va_space_up_read(va_space);
2371 }
2372 
2373 // This function tries to find and issue a cancel for each uTLB that meets
2374 // the requirements to guarantee precise fault attribution:
2375 // - No new faults can arrive on the uTLB (uTLB is in lockdown)
2376 // - The first fault in the buffer for a specific uTLB is fatal
2377 // - There are no other addresses in the uTLB with non-fatal faults only
2378 //
2379 // This function and the related helpers iterate over faults as read from HW,
2380 // not through the ordered fault view
2381 //
2382 // TODO: Bug 1766754
2383 // This is very costly, although not critical for performance since we are
2384 // cancelling.
2385 // - Build a list with all the faults within a uTLB
2386 // - Sort by uTLB id
2387 static NV_STATUS try_to_cancel_utlbs(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2388 {
2389     NvU32 i;
2390 
2391     // Fault filtering is not allowed in the TLB-based fault cancel path
2392     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2393 
2394     for (i = 0; i < batch_context->num_cached_faults; ++i) {
2395         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
2396         uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
2397         NvU32 gpc_id = current_entry->fault_source.gpc_id;
2398         NvU32 utlb_id = current_entry->fault_source.utlb_id;
2399         NvU32 client_id = current_entry->fault_source.client_id;
2400 
2401         // Only fatal faults are considered
2402         if (!current_entry->is_fatal)
2403             continue;
2404 
2405         // Only consider uTLBs in lock-down
2406         if (!utlb->in_lockdown)
2407             continue;
2408 
2409         // Issue a single cancel per uTLB
2410         if (utlb->cancelled)
2411             continue;
2412 
2413         if (is_first_fault_in_utlb(batch_context, i) &&
2414             !no_fatal_pages_in_utlb(batch_context, i + 1, utlb_id)) {
2415             NV_STATUS status;
2416 
2417             record_fatal_fault_helper(gpu, current_entry, current_entry->fatal_reason);
2418 
2419             status = push_cancel_on_gpu_targeted(gpu,
2420                                                  current_entry->instance_ptr,
2421                                                  gpc_id,
2422                                                  client_id,
2423                                                  &batch_context->tracker);
2424             if (status != NV_OK)
2425                 return status;
2426 
2427             utlb->cancelled = true;
2428         }
2429     }
2430 
2431     return NV_OK;
2432 }
2433 
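// Returns the index of the first fatal fault in the given uTLB, or
// batch_context->num_cached_faults if the uTLB contains no fatal faults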
2434 static NvU32 find_fatal_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context,
2435                                       NvU32 utlb_id)
2436 {
2437     NvU32 i;
2438 
2439     // Fault filtering is not allowed in the TLB-based fault cancel path
2440     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2441 
2442     for (i = 0; i < batch_context->num_cached_faults; ++i) {
2443         if (batch_context->fault_cache[i].is_fatal &&
2444             batch_context->fault_cache[i].fault_source.utlb_id == utlb_id)
2445             return i;
2446     }
2447 
2448     return i;
2449 }
2450 
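// Returns true if a fault matching the given entry (same instance pointer,
// fault address, access type and uTLB) is present in the batch's fault cache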
static bool is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_context,
                                     uvm_fault_buffer_entry_t *fault)
2453 {
2454     NvU32 i;
2455 
2456     // Fault filtering is not allowed in the TLB-based fault cancel path
2457     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2458 
2459     for (i = 0; i < batch_context->num_cached_faults; ++i) {
2460         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
2461         if (cmp_fault_instance_ptr(current_entry, fault) == 0 &&
2462             current_entry->fault_address == fault->fault_address &&
2463             current_entry->fault_access_type == fault->fault_access_type &&
2464             current_entry->fault_source.utlb_id == fault->fault_source.utlb_id) {
2465             return true;
2466         }
2467     }
2468 
2469     return false;
2470 }
2471 
2472 // Cancel all faults in the given fault service batch context, even those not
2473 // marked as fatal.
2474 static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
2475                                    uvm_fault_service_batch_context_t *batch_context,
2476                                    UvmEventFatalReason reason)
2477 {
2478     NV_STATUS status = NV_OK;
2479     NV_STATUS fault_status;
2480     NvU32 i = 0;
2481 
2482     UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
2483     UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
2484 
2485     while (i < batch_context->num_coalesced_faults && status == NV_OK) {
2486         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
2487         uvm_va_space_t *va_space = current_entry->va_space;
2488         bool skip_va_space;
2489 
2490         UVM_ASSERT(va_space);
2491 
2492         uvm_va_space_down_read(va_space);
2493 
2494         // If there is no GPU VA space for the GPU, ignore all faults in
2495         // that VA space. This can happen if the GPU VA space has been
2496         // destroyed since we unlocked the VA space in service_fault_batch.
        // Ignoring the fault avoids targeting a PDB that might have been
        // reused by another process.
2499         skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
2500 
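        // Process all the coalesced faults that belong to this VA space in a
        // single pass while holding its read lock.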
2501         for (;
2502              i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
2503              current_entry = batch_context->ordered_fault_cache[++i]) {
2504             uvm_fault_cancel_va_mode_t cancel_va_mode;
2505 
2506             if (skip_va_space)
2507                 continue;
2508 
2509             if (current_entry->is_fatal) {
2510                 UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
2511                 cancel_va_mode = current_entry->replayable.cancel_va_mode;
2512             }
2513             else {
2514                 current_entry->fatal_reason = reason;
2515                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
2516             }
2517 
2518             status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
2519             if (status != NV_OK)
2520                 break;
2521         }
2522 
2523         uvm_va_space_up_read(va_space);
2524     }
2525 
2526     // Because each cancel itself triggers a replay, there may be a large number
2527     // of new duplicated faults in the buffer after cancelling all the known
2528     // ones. Flushing the buffer discards them to avoid unnecessary processing.
2529     fault_status = fault_buffer_flush_locked(gpu,
2530                                              UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
2531                                              UVM_FAULT_REPLAY_TYPE_START,
2532                                              batch_context);
2533 
2534     // We report the first encountered error.
2535     if (status == NV_OK)
2536         status = fault_status;
2537 
2538     return status;
2539 }
2540 
2541 // Function called when the system has found a global error and needs to
2542 // trigger RC in RM.
2543 static void cancel_fault_batch_tlb(uvm_gpu_t *gpu,
2544                                    uvm_fault_service_batch_context_t *batch_context,
2545                                    UvmEventFatalReason reason)
2546 {
2547     NvU32 i;
2548 
2549     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
2550         NV_STATUS status = NV_OK;
2551         uvm_fault_buffer_entry_t *current_entry;
2552         uvm_fault_buffer_entry_t *coalesced_entry;
2553 
2554         current_entry = batch_context->ordered_fault_cache[i];
2555 
2556         // The list iteration below skips the entry used as 'head'.
2557         // Report the 'head' entry explicitly.
2558         uvm_va_space_down_read(current_entry->va_space);
2559         uvm_tools_record_gpu_fatal_fault(gpu->id, current_entry->va_space, current_entry, reason);
2560 
2561         list_for_each_entry(coalesced_entry, &current_entry->merged_instances_list, merged_instances_list)
2562             uvm_tools_record_gpu_fatal_fault(gpu->id, current_entry->va_space, coalesced_entry, reason);
2563         uvm_va_space_up_read(current_entry->va_space);
2564 
2565         // We need to cancel each instance pointer to correctly handle faults from multiple contexts.
2566         status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
2567         if (status != NV_OK)
2568             break;
2569     }
2570 }
2571 
2572 static void cancel_fault_batch(uvm_gpu_t *gpu,
2573                                uvm_fault_service_batch_context_t *batch_context,
2574                                UvmEventFatalReason reason)
2575 {
2576     // Return code is ignored since we're on a global error path and wouldn't be
2577     // able to recover anyway.
2578     if (gpu->parent->fault_cancel_va_supported)
2579         cancel_faults_all(gpu, batch_context, reason);
2580     else
2581         cancel_fault_batch_tlb(gpu, batch_context, reason);
2582 }
2583 
2585 // Current fault cancel algorithm
2586 //
// 1- Disable prefetching to prevent new prefetch requests from arriving and
// flooding the buffer.
2589 // LOOP
2590 //   2- Record one fatal fault per uTLB to check if it shows up after the replay
2591 //   3- Flush fault buffer (REPLAY_TYPE_START_ACK_ALL to prevent new faults from
2592 //      coming to TLBs with pending faults)
2593 //   4- Wait for replay to finish
2594 //   5- Fetch all faults from buffer
2595 //   6- Check what uTLBs are in lockdown mode and can be cancelled
2596 //   7- Preprocess faults (order per va_space, fault address, access type)
2597 //   8- Service all non-fatal faults and mark all non-serviceable faults as fatal
//      8.1- If no fatal faults are found, we are done
2599 //   9- Search for a uTLB which can be targeted for cancel, as described in
2600 //      try_to_cancel_utlbs. If found, cancel it.
2601 // END LOOP
2602 // 10- Re-enable prefetching
2603 //
2604 // NOTE: prefetch faults MUST NOT trigger fault cancel. We make sure that no
2605 // prefetch faults are left in the buffer by disabling prefetching and
2606 // flushing the fault buffer afterwards (prefetch faults are not replayed and,
2607 // therefore, will not show up again)
2608 static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2609 {
2610     NV_STATUS status;
2611     NV_STATUS tracker_status;
2612     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
2613     bool first = true;
2614 
2615     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2616 
    // 1) Disable prefetching to prevent new prefetch requests from arriving
    //    and flooding the buffer
2619     if (gpu->parent->fault_buffer_info.prefetch_faults_enabled)
2620         gpu->parent->arch_hal->disable_prefetch_faults(gpu->parent);
2621 
2622     while (1) {
2623         NvU32 utlb_id;
2624 
2625         // 2) Record one fatal fault per uTLB to check if it shows up after
2626         // the replay. This is used to handle the case in which the uTLB is
2627         // being cancelled from behind our backs by RM. See the comment in
2628         // step 6.
2629         for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
2630             uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id];
2631 
2632             if (!first && utlb->has_fatal_faults) {
2633                 NvU32 idx = find_fatal_fault_in_utlb(batch_context, utlb_id);
2634                 UVM_ASSERT(idx < batch_context->num_cached_faults);
2635 
2636                 utlb->prev_fatal_fault = batch_context->fault_cache[idx];
2637             }
2638             else {
2639                 utlb->prev_fatal_fault.fault_address = (NvU64)-1;
2640             }
2641         }
2642         first = false;
2643 
2644         // 3) Flush fault buffer. After this call, all faults from any of the
2645         // faulting uTLBs are before PUT. New faults from other uTLBs can keep
2646         // arriving. Therefore, in each iteration we just try to cancel faults
2647         // from uTLBs that contained fatal faults in the previous iterations
2648         // and will cause the TLB to stop generating new page faults after the
2649         // following replay with type UVM_FAULT_REPLAY_TYPE_START_ACK_ALL.
2650         //
        // No need to use UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT since we
        // only care about new faults from uTLBs which faulted before the
        // replay; old faults can be safely ignored.
2654         status = fault_buffer_flush_locked(gpu,
2655                                            UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
2656                                            UVM_FAULT_REPLAY_TYPE_START_ACK_ALL,
2657                                            batch_context);
2658         if (status != NV_OK)
2659             break;
2660 
2661         // 4) Wait for replay to finish
2662         status = uvm_tracker_wait(&replayable_faults->replay_tracker);
2663         if (status != NV_OK)
2664             break;
2665 
2666         batch_context->num_invalid_prefetch_faults = 0;
2667         batch_context->num_replays                 = 0;
2668         batch_context->fatal_va_space              = NULL;
2669         batch_context->has_throttled_faults        = false;
2670 
2671         // 5) Fetch all faults from buffer
2672         status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
2673         if (status != NV_OK)
2674             break;
2675 
2676         ++batch_context->batch_id;
2677 
2678         UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2679 
2680         // No more faults left, we are done
2681         if (batch_context->num_cached_faults == 0)
2682             break;
2683 
2684         // 6) Check what uTLBs are in lockdown mode and can be cancelled
2685         for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
2686             uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id];
2687 
2688             utlb->in_lockdown = false;
2689             utlb->cancelled   = false;
2690 
2691             if (utlb->prev_fatal_fault.fault_address != (NvU64)-1) {
2692                 // If a previously-reported fault shows up again we can "safely"
2693                 // assume that the uTLB that contains it is in lockdown mode
2694                 // and no new translations will show up before cancel.
2695                 // A fatal fault could only be removed behind our backs by RM
2696                 // issuing a cancel, which only happens when RM is resetting the
2697                 // engine. That means the instance pointer can't generate any
2698                 // new faults, so we won't have an ABA problem where a new
2699                 // fault arrives with the same state.
2700                 if (is_fatal_fault_in_buffer(batch_context, &utlb->prev_fatal_fault))
2701                     utlb->in_lockdown = true;
2702             }
2703         }
2704 
2705         // 7) Preprocess faults
2706         status = preprocess_fault_batch(gpu, batch_context);
2707         if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2708             continue;
2709         else if (status != NV_OK)
2710             break;
2711 
2712         // 8) Service all non-fatal faults and mark all non-serviceable faults
2713         // as fatal
2714         status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
2715         UVM_ASSERT(batch_context->num_replays == 0);
2716         if (status == NV_ERR_NO_MEMORY)
2717             continue;
2718         else if (status != NV_OK)
2719             break;
2720 
2721         // No more fatal faults left, we are done
2722         if (!batch_context->fatal_va_space)
2723             break;
2724 
2725         // 9) Search for uTLBs that contain fatal faults and meet the
2726         // requirements to be cancelled
2727         try_to_cancel_utlbs(gpu, batch_context);
2728     }
2729 
2730     // 10) Re-enable prefetching
2731     if (gpu->parent->fault_buffer_info.prefetch_faults_enabled)
2732         gpu->parent->arch_hal->enable_prefetch_faults(gpu->parent);
2733 
2734     if (status == NV_OK)
2735         status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2736 
2737     tracker_status = uvm_tracker_wait(&batch_context->tracker);
2738 
    return status == NV_OK ? tracker_status : status;
2740 }
2741 
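// Cancel faults with precise attribution. Prefer the per-VA cancel path when
// the GPU supports it; otherwise fall back to the uTLB-based cancel algorithm.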
2742 static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2743 {
2744     UVM_ASSERT(batch_context->fatal_va_space);
2745     if (gpu->parent->fault_cancel_va_supported)
2746         return service_fault_batch_for_cancel(gpu, batch_context);
2747 
2748     return cancel_faults_precise_tlb(gpu, batch_context);
2749 }
2750 
static void enable_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu,
                                           uvm_fault_service_batch_context_t *batch_context)
2752 {
2753     if (!parent_gpu->prefetch_fault_supported)
2754         return;
2755 
    // If the number of invalid prefetch accesses in the batch exceeds 66% of
    // the maximum batch size, disable prefetch faults for a while.
2758     // num_invalid_prefetch_faults may be higher than the actual count. See the
2759     // comment in mark_fault_invalid_prefetch(..).
2760     // Some tests rely on this logic (and ratio) to correctly disable prefetch
2761     // fault reporting. If the logic changes, the tests will have to be changed.
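    // The check below is integer arithmetic for num_invalid_prefetch_faults /
    // max_batch_size > 2/3. For example, with a maximum batch size of 256,
    // prefetch faults are disabled once a batch contains more than 170 invalid
    // prefetch accesses.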
2762     if (parent_gpu->fault_buffer_info.prefetch_faults_enabled &&
2763         uvm_perf_reenable_prefetch_faults_lapse_msec > 0 &&
2764         ((batch_context->num_invalid_prefetch_faults * 3 > parent_gpu->fault_buffer_info.max_batch_size * 2) ||
2765          (uvm_enable_builtin_tests &&
2766           parent_gpu->rm_info.isSimulated &&
2767           batch_context->num_invalid_prefetch_faults > 5))) {
2768         uvm_parent_gpu_disable_prefetch_faults(parent_gpu);
2769     }
2770     else if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
2771         NvU64 lapse = NV_GETTIME() - parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp;
2772 
2773         // Reenable prefetch faults after some time
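        // uvm_perf_reenable_prefetch_faults_lapse_msec is in milliseconds
        // while the timestamps are in nanoseconds, hence the msec-to-nsec
        // conversion below.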
2774         if (lapse > ((NvU64)uvm_perf_reenable_prefetch_faults_lapse_msec * (1000 * 1000)))
2775             uvm_parent_gpu_enable_prefetch_faults(parent_gpu);
2776     }
2777 }
2778 
2779 void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
2780 {
2781     NvU32 num_replays = 0;
2782     NvU32 num_batches = 0;
2783     NvU32 num_throttled = 0;
2784     NV_STATUS status = NV_OK;
2785     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
2786     uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;
2787 
2788     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2789 
2790     uvm_tracker_init(&batch_context->tracker);
2791 
2792     // Process all faults in the buffer
2793     while (1) {
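        // Bound the amount of work done in a single call: stop once the limit
        // of serviced batches, or of batches containing throttled faults, has
        // been reached.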
2794         if (num_throttled >= uvm_perf_fault_max_throttle_per_service ||
2795             num_batches >= uvm_perf_fault_max_batches_per_service) {
2796             break;
2797         }
2798 
2799         batch_context->num_invalid_prefetch_faults = 0;
2800         batch_context->num_duplicate_faults        = 0;
2801         batch_context->num_replays                 = 0;
2802         batch_context->fatal_va_space              = NULL;
2803         batch_context->has_throttled_faults        = false;
2804 
2805         status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
2806         if (status != NV_OK)
2807             break;
2808 
2809         if (batch_context->num_cached_faults == 0)
2810             break;
2811 
2812         ++batch_context->batch_id;
2813 
2814         status = preprocess_fault_batch(gpu, batch_context);
2815 
2816         num_replays += batch_context->num_replays;
2817 
2818         if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2819             continue;
2820         else if (status != NV_OK)
2821             break;
2822 
2823         status = service_fault_batch(gpu, FAULT_SERVICE_MODE_REGULAR, batch_context);
2824 
        // Replays may have been issued even if status != NV_OK, either because
        // UVM_PERF_FAULT_REPLAY_POLICY_BLOCK is being used or because the
        // fault buffer was flushed
2828         num_replays += batch_context->num_replays;
2829 
2830         enable_disable_prefetch_faults(gpu->parent, batch_context);
2831 
2832         if (status != NV_OK) {
2833             // Unconditionally cancel all faults to trigger RC. This will not
2834             // provide precise attribution, but this case handles global
2835             // errors such as OOM or ECC where it's not reasonable to
2836             // guarantee precise attribution. We ignore the return value of
2837             // the cancel operation since this path is already returning an
2838             // error code.
2839             cancel_fault_batch(gpu, batch_context, uvm_tools_status_to_fatal_fault_reason(status));
2840             break;
2841         }
2842 
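        // If the batch contains fatal faults, wait for the outstanding work to
        // complete and cancel them with precise attribution. A successful
        // cancel issues at least one replay, so servicing can continue with
        // the next batch.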
2843         if (batch_context->fatal_va_space) {
2844             status = uvm_tracker_wait(&batch_context->tracker);
2845             if (status == NV_OK) {
2846                 status = cancel_faults_precise(gpu, batch_context);
2847                 if (status == NV_OK) {
2848                     // Cancel handling should've issued at least one replay
2849                     UVM_ASSERT(batch_context->num_replays > 0);
2850                     ++num_batches;
2851                     continue;
2852                 }
2853             }
2854 
2855             break;
2856         }
2857 
2858         if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH) {
2859             status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2860             if (status != NV_OK)
2861                 break;
2862             ++num_replays;
2863         }
2864         else if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH) {
2865             uvm_gpu_buffer_flush_mode_t flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT;
2866 
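            // Integer arithmetic for: percentage of duplicate faults in the
            // batch > replay_update_put_ratio. When the ratio is exceeded,
            // re-read PUT from HW so the flush also covers duplicates that
            // arrived after the cached value.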
2867             if (batch_context->num_duplicate_faults * 100 >
2868                 batch_context->num_cached_faults * replayable_faults->replay_update_put_ratio) {
2869                 flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT;
2870             }
2871 
2872             status = fault_buffer_flush_locked(gpu, flush_mode, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2873             if (status != NV_OK)
2874                 break;
2875             ++num_replays;
2876             status = uvm_tracker_wait(&replayable_faults->replay_tracker);
2877             if (status != NV_OK)
2878                 break;
2879         }
2880 
2881         if (batch_context->has_throttled_faults)
2882             ++num_throttled;
2883 
2884         ++num_batches;
2885     }
2886 
2887     if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2888         status = NV_OK;
2889 
    // Issue a replay after servicing the whole batch if the ONCE replay policy
    // is used. Also make sure that at least one replay is issued, so that
    // faults which were dropped and do not show up in the buffer are not lost.
2892     if ((status == NV_OK && replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_ONCE) ||
2893         num_replays == 0)
2894         status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2895 
2896     uvm_tracker_deinit(&batch_context->tracker);
2897 
2898     if (status != NV_OK)
2899         UVM_DBG_PRINT("Error servicing replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
2900 }
2901 
2902 void uvm_parent_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
2903 {
2904     UVM_ASSERT(parent_gpu->isr.replayable_faults.handling);
2905     UVM_ASSERT(parent_gpu->prefetch_fault_supported);
2906 
2907     if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
2908         parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
2909         parent_gpu->fault_buffer_info.prefetch_faults_enabled = true;
2910     }
2911 }
2912 
2913 void uvm_parent_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
2914 {
2915     UVM_ASSERT(parent_gpu->isr.replayable_faults.handling);
2916     UVM_ASSERT(parent_gpu->prefetch_fault_supported);
2917 
2918     if (parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
2919         parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu);
2920         parent_gpu->fault_buffer_info.prefetch_faults_enabled = false;
2921         parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp = NV_GETTIME();
2922     }
2923 }
2924 
2925 const char *uvm_perf_fault_replay_policy_string(uvm_perf_fault_replay_policy_t replay_policy)
2926 {
2927     BUILD_BUG_ON(UVM_PERF_FAULT_REPLAY_POLICY_MAX != 4);
2928 
2929     switch (replay_policy) {
2930         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BLOCK);
2931         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH);
2932         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH);
2933         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_ONCE);
2934         UVM_ENUM_STRING_DEFAULT();
2935     }
2936 }
2937 
2938 NV_STATUS uvm_test_get_prefetch_faults_reenable_lapse(UVM_TEST_GET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params,
2939                                                       struct file *filp)
2940 {
2941     params->reenable_lapse = uvm_perf_reenable_prefetch_faults_lapse_msec;
2942 
2943     return NV_OK;
2944 }
2945 
2946 NV_STATUS uvm_test_set_prefetch_faults_reenable_lapse(UVM_TEST_SET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params,
2947                                                       struct file *filp)
2948 {
2949     uvm_perf_reenable_prefetch_faults_lapse_msec = params->reenable_lapse;
2950 
2951     return NV_OK;
2952 }
2953 
2954 NV_STATUS uvm_test_drain_replayable_faults(UVM_TEST_DRAIN_REPLAYABLE_FAULTS_PARAMS *params, struct file *filp)
2955 {
2956     uvm_gpu_t *gpu;
2957     NV_STATUS status = NV_OK;
2958     uvm_spin_loop_t spin;
2959     bool pending = true;
2960     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2961 
2962     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
2963     if (!gpu)
2964         return NV_ERR_INVALID_DEVICE;
2965 
2966     uvm_spin_loop_init(&spin);
2967 
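    // Poll until no replayable faults are pending, a fatal signal is received,
    // or the requested timeout expires.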
2968     do {
2969         uvm_parent_gpu_replayable_faults_isr_lock(gpu->parent);
2970         pending = uvm_parent_gpu_replayable_faults_pending(gpu->parent);
2971         uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
2972 
2973         if (!pending)
2974             break;
2975 
2976         if (fatal_signal_pending(current)) {
2977             status = NV_ERR_SIGNAL_PENDING;
2978             break;
2979         }
2980 
2981         UVM_SPIN_LOOP(&spin);
2982     } while (uvm_spin_loop_elapsed(&spin) < params->timeout_ns);
2983 
2984     if (pending && status == NV_OK)
2985         status = NV_ERR_TIMEOUT;
2986 
2987     uvm_gpu_release(gpu);
2988 
2989     return status;
2990 }
2991