1 /*******************************************************************************
2     Copyright (c) 2015-2022 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "linux/sort.h"
25 #include "nv_uvm_interface.h"
26 #include "uvm_linux.h"
27 #include "uvm_global.h"
28 #include "uvm_gpu_replayable_faults.h"
29 #include "uvm_hal.h"
30 #include "uvm_kvmalloc.h"
31 #include "uvm_tools.h"
32 #include "uvm_va_block.h"
33 #include "uvm_va_range.h"
34 #include "uvm_va_space.h"
35 #include "uvm_va_space_mm.h"
36 #include "uvm_procfs.h"
37 #include "uvm_perf_thrashing.h"
38 #include "uvm_gpu_non_replayable_faults.h"
39 #include "uvm_ats_faults.h"
40 #include "uvm_test.h"
41 
42 // The documentation at the beginning of uvm_gpu_non_replayable_faults.c
43 // provides some background for understanding replayable faults, non-replayable
44 // faults, and how UVM services each fault type.
45 
46 #define UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT 1000
47 
// Lapse of time in milliseconds after which prefetch faults can be
// re-enabled. 0 means prefetch faults are never disabled.
50 static unsigned uvm_perf_reenable_prefetch_faults_lapse_msec = UVM_PERF_REENABLE_PREFETCH_FAULTS_LAPSE_MSEC_DEFAULT;
51 module_param(uvm_perf_reenable_prefetch_faults_lapse_msec, uint, S_IRUGO);
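// The current value can be inspected at runtime through sysfs, e.g. (assuming
// the usual nvidia-uvm module packaging):
//   cat /sys/module/nvidia_uvm/parameters/uvm_perf_reenable_prefetch_faults_lapse_msec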
52 
53 #define UVM_PERF_FAULT_BATCH_COUNT_MIN 1
54 #define UVM_PERF_FAULT_BATCH_COUNT_DEFAULT 256
55 
// Number of entries that are fetched from the GPU fault buffer and serviced in
// a batch
58 static unsigned uvm_perf_fault_batch_count = UVM_PERF_FAULT_BATCH_COUNT_DEFAULT;
59 module_param(uvm_perf_fault_batch_count, uint, S_IRUGO);
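// The batch size can be tuned at module load time, e.g. (illustrative,
// assuming the module is loaded as nvidia-uvm):
//   modprobe nvidia-uvm uvm_perf_fault_batch_count=512
// The value provided here is clamped to [UVM_PERF_FAULT_BATCH_COUNT_MIN,
// max_faults] in fault_buffer_init_replayable_faults().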
60 
61 #define UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH
62 
63 // Policy that determines when to issue fault replays
64 static uvm_perf_fault_replay_policy_t uvm_perf_fault_replay_policy = UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;
65 module_param(uvm_perf_fault_replay_policy, uint, S_IRUGO);
66 
67 #define UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT 50
68 
// Reading fault buffer GET/PUT pointers from the CPU is expensive. However,
// updating PUT before flushing the buffer helps minimize the number of
// duplicates in the buffer, as it discards faults that were not processed
// because of the batch size limit or because they arrived during servicing.
// If PUT is not updated, the replay operation will make them show up again
// in the buffer as duplicates.
//
// We keep track of the number of duplicates in each batch and use
// UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT for the fault buffer flush that follows
// servicing if the percentage of duplicate faults in the batch is greater than
// the ratio defined in the following module parameter. Otherwise,
// UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT is used.
81 static unsigned uvm_perf_fault_replay_update_put_ratio = UVM_PERF_FAULT_REPLAY_UPDATE_PUT_RATIO_DEFAULT;
82 module_param(uvm_perf_fault_replay_update_put_ratio, uint, S_IRUGO);
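// For example, with the default ratio of 50, a batch in which 60 out of 100
// fetched faults were duplicates (60% > 50%) triggers an UPDATE_PUT flush,
// while a batch with only 30 duplicates keeps using the cached PUT value.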
83 
84 #define UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT 20
85 
86 #define UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT 5
87 
88 // Maximum number of batches to be processed per execution of the bottom-half
89 static unsigned uvm_perf_fault_max_batches_per_service = UVM_PERF_FAULT_MAX_BATCHES_PER_SERVICE_DEFAULT;
90 module_param(uvm_perf_fault_max_batches_per_service, uint, S_IRUGO);
91 
92 // Maximum number of batches with thrashing pages per execution of the bottom-half
93 static unsigned uvm_perf_fault_max_throttle_per_service = UVM_PERF_FAULT_MAX_THROTTLE_PER_SERVICE_DEFAULT;
94 module_param(uvm_perf_fault_max_throttle_per_service, uint, S_IRUGO);
95 
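// When non-zero, coalesce duplicate fault entries (same instance pointer and
// fault address) while fetching them from the fault buffer. See
// fetch_fault_buffer_entries() for details.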
96 static unsigned uvm_perf_fault_coalesce = 1;
97 module_param(uvm_perf_fault_coalesce, uint, S_IRUGO);
98 
99 // This function is used for both the initial fault buffer initialization and
100 // the power management resume path.
101 static void fault_buffer_reinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
102 {
103     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
104 
105     // Read the current get/put pointers, as this might not be the first time
106     // we take control of the fault buffer since the GPU was initialized,
107     // or since we may need to bring UVM's cached copies back in sync following
108     // a sleep cycle.
109     replayable_faults->cached_get = parent_gpu->fault_buffer_hal->read_get(parent_gpu);
110     replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
111 
112     // (Re-)enable fault prefetching
113     if (parent_gpu->fault_buffer_info.prefetch_faults_enabled)
114         parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
115     else
116         parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu);
117 }
118 
119 // There is no error handling in this function. The caller is in charge of
120 // calling fault_buffer_deinit_replayable_faults on failure.
121 static NV_STATUS fault_buffer_init_replayable_faults(uvm_parent_gpu_t *parent_gpu)
122 {
123     NV_STATUS status = NV_OK;
124     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
125     uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;
126 
127     UVM_ASSERT(parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize %
128                parent_gpu->fault_buffer_hal->entry_size(parent_gpu) == 0);
129 
130     replayable_faults->max_faults = parent_gpu->fault_buffer_info.rm_info.replayable.bufferSize /
131                                     parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
132 
133     // Check provided module parameter value
134     parent_gpu->fault_buffer_info.max_batch_size = max(uvm_perf_fault_batch_count,
135                                                        (NvU32)UVM_PERF_FAULT_BATCH_COUNT_MIN);
136     parent_gpu->fault_buffer_info.max_batch_size = min(parent_gpu->fault_buffer_info.max_batch_size,
137                                                        replayable_faults->max_faults);
138 
139     if (parent_gpu->fault_buffer_info.max_batch_size != uvm_perf_fault_batch_count) {
140         pr_info("Invalid uvm_perf_fault_batch_count value on GPU %s: %u. Valid range [%u:%u] Using %u instead\n",
141                 parent_gpu->name,
142                 uvm_perf_fault_batch_count,
143                 UVM_PERF_FAULT_BATCH_COUNT_MIN,
144                 replayable_faults->max_faults,
145                 parent_gpu->fault_buffer_info.max_batch_size);
146     }
147 
148     batch_context->fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults * sizeof(*batch_context->fault_cache));
149     if (!batch_context->fault_cache)
150         return NV_ERR_NO_MEMORY;
151 
152     // fault_cache is used to signal that the tracker was initialized.
153     uvm_tracker_init(&replayable_faults->replay_tracker);
154 
155     batch_context->ordered_fault_cache = uvm_kvmalloc_zero(replayable_faults->max_faults *
156                                                            sizeof(*batch_context->ordered_fault_cache));
157     if (!batch_context->ordered_fault_cache)
158         return NV_ERR_NO_MEMORY;
159 
160     // This value must be initialized by HAL
161     UVM_ASSERT(replayable_faults->utlb_count > 0);
162 
163     batch_context->utlbs = uvm_kvmalloc_zero(replayable_faults->utlb_count * sizeof(*batch_context->utlbs));
164     if (!batch_context->utlbs)
165         return NV_ERR_NO_MEMORY;
166 
167     batch_context->max_utlb_id = 0;
168 
169     status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_TRUE));
170     if (status != NV_OK) {
171         UVM_ERR_PRINT("Failed to take page fault ownership from RM: %s, GPU %s\n",
172                       nvstatusToString(status),
173                       parent_gpu->name);
174         return status;
175     }
176 
    replayable_faults->replay_policy = uvm_perf_fault_replay_policy < UVM_PERF_FAULT_REPLAY_POLICY_MAX ?
                                           uvm_perf_fault_replay_policy :
                                           UVM_PERF_FAULT_REPLAY_POLICY_DEFAULT;
180 
181     if (replayable_faults->replay_policy != uvm_perf_fault_replay_policy) {
182         pr_info("Invalid uvm_perf_fault_replay_policy value on GPU %s: %d. Using %d instead\n",
183                 parent_gpu->name,
184                 uvm_perf_fault_replay_policy,
185                 replayable_faults->replay_policy);
186     }
187 
188     replayable_faults->replay_update_put_ratio = min(uvm_perf_fault_replay_update_put_ratio, 100u);
189     if (replayable_faults->replay_update_put_ratio != uvm_perf_fault_replay_update_put_ratio) {
190         pr_info("Invalid uvm_perf_fault_replay_update_put_ratio value on GPU %s: %u. Using %u instead\n",
191                 parent_gpu->name,
192                 uvm_perf_fault_replay_update_put_ratio,
193                 replayable_faults->replay_update_put_ratio);
194     }
195 
196     // Re-enable fault prefetching just in case it was disabled in a previous run
197     parent_gpu->fault_buffer_info.prefetch_faults_enabled = parent_gpu->prefetch_fault_supported;
198 
199     fault_buffer_reinit_replayable_faults(parent_gpu);
200 
201     return NV_OK;
202 }
203 
204 static void fault_buffer_deinit_replayable_faults(uvm_parent_gpu_t *parent_gpu)
205 {
206     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
207     uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;
208 
209     if (batch_context->fault_cache) {
210         UVM_ASSERT(uvm_tracker_is_empty(&replayable_faults->replay_tracker));
211         uvm_tracker_deinit(&replayable_faults->replay_tracker);
212     }
213 
214     if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
215         // Re-enable prefetch faults in case we disabled them
216         if (parent_gpu->prefetch_fault_supported && !parent_gpu->fault_buffer_info.prefetch_faults_enabled)
217             parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
218     }
219 
220     uvm_kvfree(batch_context->fault_cache);
221     uvm_kvfree(batch_context->ordered_fault_cache);
222     uvm_kvfree(batch_context->utlbs);
223     batch_context->fault_cache         = NULL;
224     batch_context->ordered_fault_cache = NULL;
225     batch_context->utlbs               = NULL;
226 }
227 
228 NV_STATUS uvm_gpu_fault_buffer_init(uvm_parent_gpu_t *parent_gpu)
229 {
230     NV_STATUS status = NV_OK;
231 
232     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
233     UVM_ASSERT(parent_gpu->replayable_faults_supported);
234 
235     status = uvm_rm_locked_call(nvUvmInterfaceInitFaultInfo(parent_gpu->rm_device,
236                                                             &parent_gpu->fault_buffer_info.rm_info));
237     if (status != NV_OK) {
238         UVM_ERR_PRINT("Failed to init fault buffer info from RM: %s, GPU %s\n",
239                       nvstatusToString(status),
240                       parent_gpu->name);
241 
242         // nvUvmInterfaceInitFaultInfo may leave fields in rm_info populated
243         // when it returns an error. Set the buffer handle to zero as it is
244         // used by the deinitialization logic to determine if it was correctly
245         // initialized.
246         parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
247         goto fail;
248     }
249 
250     status = fault_buffer_init_replayable_faults(parent_gpu);
251     if (status != NV_OK)
252         goto fail;
253 
254     if (parent_gpu->non_replayable_faults_supported) {
255         status = uvm_gpu_fault_buffer_init_non_replayable_faults(parent_gpu);
256         if (status != NV_OK)
257             goto fail;
258     }
259 
260     return NV_OK;
261 
262 fail:
263     uvm_gpu_fault_buffer_deinit(parent_gpu);
264 
265     return status;
266 }
267 
268 // Reinitialize state relevant to replayable fault handling after returning
269 // from a power management cycle.
270 void uvm_gpu_fault_buffer_resume(uvm_parent_gpu_t *parent_gpu)
271 {
272     UVM_ASSERT(parent_gpu->replayable_faults_supported);
273 
274     fault_buffer_reinit_replayable_faults(parent_gpu);
275 }
276 
277 void uvm_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu)
278 {
279     NV_STATUS status = NV_OK;
280 
281     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
282 
283     if (parent_gpu->non_replayable_faults_supported)
284         uvm_gpu_fault_buffer_deinit_non_replayable_faults(parent_gpu);
285 
286     fault_buffer_deinit_replayable_faults(parent_gpu);
287 
288     if (parent_gpu->fault_buffer_info.rm_info.faultBufferHandle) {
289         status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_FALSE));
290         UVM_ASSERT(status == NV_OK);
291 
292         uvm_rm_locked_call_void(nvUvmInterfaceDestroyFaultInfo(parent_gpu->rm_device,
293                                                                &parent_gpu->fault_buffer_info.rm_info));
294 
295         parent_gpu->fault_buffer_info.rm_info.faultBufferHandle = 0;
296     }
297 }
298 
299 bool uvm_gpu_replayable_faults_pending(uvm_parent_gpu_t *parent_gpu)
300 {
301     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
302 
303     UVM_ASSERT(parent_gpu->replayable_faults_supported);
304 
305     // Fast path 1: we left some faults unserviced in the buffer in the last pass
306     if (replayable_faults->cached_get != replayable_faults->cached_put)
307         return true;
308 
    // Fast path 2: read the valid bit of the fault buffer entry pointed to by
    // the cached get pointer
311     if (!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, replayable_faults->cached_get)) {
312         // Slow path: read the put pointer from the GPU register via BAR0
313         // over PCIe
314         replayable_faults->cached_put = parent_gpu->fault_buffer_hal->read_put(parent_gpu);
315 
316         // No interrupt pending
317         if (replayable_faults->cached_get == replayable_faults->cached_put)
318             return false;
319     }
320 
321     return true;
322 }
323 
// Push a fault cancel method on the given client. Any failure during this
// operation may lead to an application hang (requiring a manual Ctrl+C from
// the user) or to a system crash (requiring a reboot). In that case we log an
// error message.
328 //
329 // gpc_id and client_id aren't used if global_cancel is true.
330 //
331 // This function acquires both the given tracker and the replay tracker
332 static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu,
333                                     uvm_gpu_phys_address_t instance_ptr,
334                                     bool global_cancel,
335                                     NvU32 gpc_id,
336                                     NvU32 client_id,
337                                     uvm_tracker_t *tracker)
338 {
339     NV_STATUS status;
340     uvm_push_t push;
341     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
342 
343     if (global_cancel) {
344         status = uvm_push_begin_acquire(gpu->channel_manager,
345                                         UVM_CHANNEL_TYPE_MEMOPS,
346                                         &replayable_faults->replay_tracker,
347                                         &push,
348                                         "Cancel targeting instance_ptr {0x%llx:%s}\n",
349                                         instance_ptr.address,
350                                         uvm_aperture_string(instance_ptr.aperture));
351     } else {
352         status = uvm_push_begin_acquire(gpu->channel_manager,
353                                         UVM_CHANNEL_TYPE_MEMOPS,
354                                         &replayable_faults->replay_tracker,
355                                         &push,
356                                         "Cancel targeting instance_ptr {0x%llx:%s} gpc %u client %u\n",
357                                         instance_ptr.address,
358                                         uvm_aperture_string(instance_ptr.aperture),
359                                         gpc_id,
360                                         client_id);
361     }
362 
363     UVM_ASSERT(status == NV_OK);
364     if (status != NV_OK) {
365         UVM_ERR_PRINT("Failed to create push and acquire replay tracker before pushing cancel: %s, GPU %s\n",
366                       nvstatusToString(status),
367                       uvm_gpu_name(gpu));
368         return status;
369     }
370 
371     uvm_push_acquire_tracker(&push, tracker);
372 
373     if (global_cancel)
374         gpu->parent->host_hal->cancel_faults_global(&push, instance_ptr);
    else
376         gpu->parent->host_hal->cancel_faults_targeted(&push, instance_ptr, gpc_id, client_id);
377 
378     // We don't need to put the cancel in the GPU replay tracker since we wait
379     // on it immediately.
380     status = uvm_push_end_and_wait(&push);
381 
382     UVM_ASSERT(status == NV_OK);
383     if (status != NV_OK)
384         UVM_ERR_PRINT("Failed to wait for pushed cancel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
385 
386     uvm_tracker_clear(&replayable_faults->replay_tracker);
387 
388     return status;
389 }
390 
391 static NV_STATUS push_cancel_on_gpu_targeted(uvm_gpu_t *gpu,
392                                              uvm_gpu_phys_address_t instance_ptr,
393                                              NvU32 gpc_id,
394                                              NvU32 client_id,
395                                              uvm_tracker_t *tracker)
396 {
397     return push_cancel_on_gpu(gpu, instance_ptr, false, gpc_id, client_id, tracker);
398 }
399 
400 static NV_STATUS push_cancel_on_gpu_global(uvm_gpu_t *gpu, uvm_gpu_phys_address_t instance_ptr, uvm_tracker_t *tracker)
401 {
402     UVM_ASSERT(!gpu->parent->smc.enabled);
403 
404     return push_cancel_on_gpu(gpu, instance_ptr, true, 0, 0, tracker);
405 }
406 
// Volta implements a targeted VA fault cancel that simplifies the fault cancel
// process. You only need to specify the address, type, and mmu_engine_id of
// the access to be cancelled. The caller must hold the lock of the VA space
// the cancelled access belongs to.
411 static NV_STATUS cancel_fault_precise_va(uvm_gpu_t *gpu,
412                                          uvm_fault_buffer_entry_t *fault_entry,
413                                          uvm_fault_cancel_va_mode_t cancel_va_mode)
414 {
415     NV_STATUS status;
416     uvm_gpu_va_space_t *gpu_va_space;
417     uvm_gpu_phys_address_t pdb;
418     uvm_push_t push;
419     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
420     NvU64 offset;
421 
422     UVM_ASSERT(gpu->parent->replayable_faults_supported);
423     UVM_ASSERT(fault_entry->fatal_reason != UvmEventFatalReasonInvalid);
424     UVM_ASSERT(!fault_entry->filtered);
425 
426     gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(fault_entry->va_space, gpu->parent);
427     UVM_ASSERT(gpu_va_space);
428     pdb = uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr;
429 
430     // Record fatal fault event
431     uvm_tools_record_gpu_fatal_fault(gpu->id, fault_entry->va_space, fault_entry, fault_entry->fatal_reason);
432 
433     status = uvm_push_begin_acquire(gpu->channel_manager,
434                                     UVM_CHANNEL_TYPE_MEMOPS,
435                                     &replayable_faults->replay_tracker,
436                                     &push,
437                                     "Precise cancel targeting PDB {0x%llx:%s} VA 0x%llx VEID %u with access type %s",
438                                     pdb.address,
439                                     uvm_aperture_string(pdb.aperture),
440                                     fault_entry->fault_address,
441                                     fault_entry->fault_source.ve_id,
442                                     uvm_fault_access_type_string(fault_entry->fault_access_type));
443     if (status != NV_OK) {
444         UVM_ERR_PRINT("Failed to create push and acquire replay tracker before pushing cancel: %s, GPU %s\n",
445                       nvstatusToString(status),
446                       uvm_gpu_name(gpu));
447         return status;
448     }
449 
450     // UVM aligns fault addresses to PAGE_SIZE as it is the smallest mapping
451     // and coherence tracking granularity. However, the cancel method requires
452     // the original address (4K-aligned) reported in the packet, which is lost
453     // at this point. Since the access permissions are the same for the whole
454     // 64K page, we issue a cancel per 4K range to make sure that the HW sees
455     // the address reported in the packet.
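    //
    // For example, on a platform with a 64KB PAGE_SIZE this pushes 16 cancel
    // methods covering the 4KB sub-ranges of the faulting page, while on a 4KB
    // PAGE_SIZE platform a single cancel targeting the original address is
    // pushed.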
456     for (offset = 0; offset < PAGE_SIZE; offset += UVM_PAGE_SIZE_4K) {
457         gpu->parent->host_hal->cancel_faults_va(&push, pdb, fault_entry, cancel_va_mode);
458         fault_entry->fault_address += UVM_PAGE_SIZE_4K;
459     }
460     fault_entry->fault_address = UVM_PAGE_ALIGN_DOWN(fault_entry->fault_address - 1);
461 
462     // We don't need to put the cancel in the GPU replay tracker since we wait
463     // on it immediately.
464     status = uvm_push_end_and_wait(&push);
465     if (status != NV_OK) {
466         UVM_ERR_PRINT("Failed to wait for pushed VA global fault cancel: %s, GPU %s\n",
467                       nvstatusToString(status), uvm_gpu_name(gpu));
468     }
469 
470     uvm_tracker_clear(&replayable_faults->replay_tracker);
471 
472     return status;
473 }
474 
static NV_STATUS push_replay_on_gpu(uvm_gpu_t *gpu,
                                    uvm_fault_replay_type_t type,
                                    uvm_fault_service_batch_context_t *batch_context)
476 {
477     NV_STATUS status;
478     uvm_push_t push;
479     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
480     uvm_tracker_t *tracker = NULL;
481 
482     if (batch_context)
483         tracker = &batch_context->tracker;
484 
485     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, tracker, &push,
486                                     "Replaying faults");
487     if (status != NV_OK)
488         return status;
489 
490     gpu->parent->host_hal->replay_faults(&push, type);
491 
492     // Do not count REPLAY_TYPE_START_ACK_ALL's toward the replay count.
493     // REPLAY_TYPE_START_ACK_ALL's are issued for cancels, and the cancel
494     // algorithm checks to make sure that no REPLAY_TYPE_START's have been
495     // issued using batch_context->replays.
496     if (batch_context && type != UVM_FAULT_REPLAY_TYPE_START_ACK_ALL) {
497         uvm_tools_broadcast_replay(gpu, &push, batch_context->batch_id, UVM_FAULT_CLIENT_TYPE_GPC);
498         ++batch_context->num_replays;
499     }
500 
501     uvm_push_end(&push);
502 
503     // Add this push to the GPU's replay_tracker so cancel can wait on it.
504     status = uvm_tracker_add_push_safe(&replayable_faults->replay_tracker, &push);
505 
506     if (uvm_procfs_is_debug_enabled()) {
507         if (type == UVM_FAULT_REPLAY_TYPE_START)
508             ++replayable_faults->stats.num_replays;
509         else
510             ++replayable_faults->stats.num_replays_ack_all;
511     }
512 
513     return status;
514 }
515 
516 static void write_get(uvm_parent_gpu_t *parent_gpu, NvU32 get)
517 {
518     uvm_replayable_fault_buffer_info_t *replayable_faults = &parent_gpu->fault_buffer_info.replayable;
519 
520     UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
521 
522     // Write get on the GPU only if it's changed.
523     if (replayable_faults->cached_get == get)
524         return;
525 
526     replayable_faults->cached_get = get;
527 
528     // Update get pointer on the GPU
529     parent_gpu->fault_buffer_hal->write_get(parent_gpu, get);
530 }
531 
532 static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu,
533                                            uvm_gpu_buffer_flush_mode_t flush_mode,
534                                            uvm_fault_replay_type_t fault_replay,
535                                            uvm_fault_service_batch_context_t *batch_context)
536 {
537     NvU32 get;
538     NvU32 put;
539     uvm_spin_loop_t spin;
540     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
541 
542     UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
543     UVM_ASSERT(gpu->parent->replayable_faults_supported);
544 
545     // Read PUT pointer from the GPU if requested
546     if (flush_mode == UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT)
547         replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);
548 
549     get = replayable_faults->cached_get;
550     put = replayable_faults->cached_put;
551 
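    // Mark every entry between GET and PUT as processed without servicing it.
    // Faults that are still outstanding will be reported again after the
    // replay pushed below.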
552     while (get != put) {
553         // Wait until valid bit is set
554         UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin);
555 
556         gpu->parent->fault_buffer_hal->entry_clear_valid(gpu->parent, get);
557         ++get;
558         if (get == replayable_faults->max_faults)
559             get = 0;
560     }
561 
562     write_get(gpu->parent, get);
563 
564     // Issue fault replay
565     return push_replay_on_gpu(gpu, fault_replay, batch_context);
566 }
567 
568 NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu)
569 {
570     NV_STATUS status = NV_OK;
571 
572     UVM_ASSERT(gpu->parent->replayable_faults_supported);
573 
574     // Disables replayable fault interrupts and fault servicing
575     uvm_gpu_replayable_faults_isr_lock(gpu->parent);
576 
577     status = fault_buffer_flush_locked(gpu,
578                                        UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
579                                        UVM_FAULT_REPLAY_TYPE_START,
580                                        NULL);
581 
582     // This will trigger the top half to start servicing faults again, if the
583     // replay brought any back in
584     uvm_gpu_replayable_faults_isr_unlock(gpu->parent);
585     return status;
586 }
587 
588 static inline int cmp_fault_instance_ptr(const uvm_fault_buffer_entry_t *a,
589                                          const uvm_fault_buffer_entry_t *b)
590 {
591     int result = uvm_gpu_phys_addr_cmp(a->instance_ptr, b->instance_ptr);
592     // On Volta+ we need to sort by {instance_ptr + subctx_id} pair since it can
593     // map to a different VA space
594     if (result != 0)
595         return result;
596     return UVM_CMP_DEFAULT(a->fault_source.ve_id, b->fault_source.ve_id);
597 }
598 
599 // Compare two VA spaces
600 static inline int cmp_va_space(const uvm_va_space_t *a, const uvm_va_space_t *b)
601 {
602     return UVM_CMP_DEFAULT(a, b);
603 }
604 
605 // Compare two virtual addresses
606 static inline int cmp_addr(NvU64 a, NvU64 b)
607 {
608     return UVM_CMP_DEFAULT(a, b);
609 }
610 
611 // Compare two fault access types
612 static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_type_t b)
613 {
614     UVM_ASSERT(a >= 0 && a < UVM_FAULT_ACCESS_TYPE_COUNT);
615     UVM_ASSERT(b >= 0 && b < UVM_FAULT_ACCESS_TYPE_COUNT);
616 
617     // Check that fault access type enum values are ordered by "intrusiveness"
618     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG <= UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK);
619     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK <= UVM_FAULT_ACCESS_TYPE_WRITE);
620     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_WRITE <= UVM_FAULT_ACCESS_TYPE_READ);
621     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_READ <= UVM_FAULT_ACCESS_TYPE_PREFETCH);
622 
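    // Higher enum values are more intrusive, so returning (b - a) makes more
    // intrusive access types compare as smaller and therefore sort first in an
    // ascending sort (e.g. ATOMIC_STRONG is ordered before READ).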
623     return b - a;
624 }
625 
626 typedef enum
627 {
    // Fetch a batch of faults from the buffer, waiting for each entry in the
    // batch to become ready
630 
631     // Fetch a batch of faults from the buffer. Stop at the first entry that is
632     // not ready yet
633     FAULT_FETCH_MODE_BATCH_READY,
634 
635     // Fetch all faults in the buffer before PUT. Wait for all faults to become
636     // ready
637     FAULT_FETCH_MODE_ALL,
638 } fault_fetch_mode_t;
639 
640 static void fetch_fault_buffer_merge_entry(uvm_fault_buffer_entry_t *current_entry,
641                                            uvm_fault_buffer_entry_t *last_entry)
642 {
643     UVM_ASSERT(last_entry->num_instances > 0);
644 
645     ++last_entry->num_instances;
646     uvm_fault_access_type_mask_set(&last_entry->access_type_mask, current_entry->fault_access_type);
647 
648     if (current_entry->fault_access_type > last_entry->fault_access_type) {
649         // If the new entry has a higher access type, it becomes the
650         // fault to be serviced. Add the previous one to the list of instances
651         current_entry->access_type_mask = last_entry->access_type_mask;
652         current_entry->num_instances = last_entry->num_instances;
653         last_entry->filtered = true;
654 
        // We only merge faults from different uTLBs if the new fault has an
        // access type with the same or lower level of intrusiveness, so a new
        // fault with a higher access type must come from the same uTLB as the
        // previous entry.
        UVM_ASSERT(current_entry->fault_source.utlb_id == last_entry->fault_source.utlb_id);
658 
659         list_replace(&last_entry->merged_instances_list, &current_entry->merged_instances_list);
660         list_add(&last_entry->merged_instances_list, &current_entry->merged_instances_list);
661     }
662     else {
663         // Add the new entry to the list of instances for reporting purposes
664         current_entry->filtered = true;
665         list_add(&current_entry->merged_instances_list, &last_entry->merged_instances_list);
666     }
667 }
668 
669 static bool fetch_fault_buffer_try_merge_entry(uvm_fault_buffer_entry_t *current_entry,
670                                                uvm_fault_service_batch_context_t *batch_context,
671                                                uvm_fault_utlb_info_t *current_tlb,
672                                                bool is_same_instance_ptr)
673 {
674     uvm_fault_buffer_entry_t *last_tlb_entry = current_tlb->last_fault;
675     uvm_fault_buffer_entry_t *last_global_entry = batch_context->last_fault;
676 
    // Check both the last coalesced fault in the batch and the last coalesced
    // fault that originated from this uTLB
679     const bool is_last_tlb_fault = current_tlb->num_pending_faults > 0 &&
680                                    cmp_fault_instance_ptr(current_entry, last_tlb_entry) == 0 &&
681                                    current_entry->fault_address == last_tlb_entry->fault_address;
682 
683     // We only merge faults from different uTLBs if the new fault has an
684     // access type with the same or lower level of intrusiveness. This is to
685     // avoid having to update num_pending_faults on both uTLBs and recomputing
686     // last_fault.
687     const bool is_last_fault = is_same_instance_ptr &&
688                                current_entry->fault_address == last_global_entry->fault_address &&
689                                current_entry->fault_access_type <= last_global_entry->fault_access_type;
690 
691     if (is_last_tlb_fault) {
692         fetch_fault_buffer_merge_entry(current_entry, last_tlb_entry);
693         if (current_entry->fault_access_type > last_tlb_entry->fault_access_type)
694             current_tlb->last_fault = current_entry;
695 
696         return true;
697     }
698     else if (is_last_fault) {
699         fetch_fault_buffer_merge_entry(current_entry, last_global_entry);
700         if (current_entry->fault_access_type > last_global_entry->fault_access_type)
701             batch_context->last_fault = current_entry;
702 
703         return true;
704     }
705 
706     return false;
707 }
708 
709 // Fetch entries from the fault buffer, decode them and store them in the batch
710 // context. We implement the fetch modes described above.
711 //
712 // When possible, we coalesce duplicate entries to minimize the fault handling
713 // overhead. Basically, we merge faults with the same instance pointer and page
714 // virtual address. We keep track of the last fault per uTLB to detect
715 // duplicates due to local reuse and the last fault in the whole batch to
716 // detect reuse across CTAs.
717 //
// We will service the first fault entry with the most "intrusive" (atomic >
// write > read > prefetch) access type*. That fault entry is called the
// "representative". The remaining faults have the "filtered" flag set and are
// added to a list in the representative fault entry for reporting purposes.
// The representative fault entry also contains a mask with all the access
// types that produced a fault on the page.
724 //
725 // *We only merge faults from different uTLBs if the new fault has an access
726 // type with the same or lower level of intrusiveness.
727 //
728 // This optimization cannot be performed during fault cancel on Pascal GPUs
729 // (fetch_mode == FAULT_FETCH_MODE_ALL) since we need accurate tracking of all
730 // the faults in each uTLB in order to guarantee precise fault attribution.
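//
// As an illustrative example: if the same page is faulted on three times from
// the same uTLB within one batch, with read, read and write access types, a
// single representative entry with fault_access_type == WRITE, num_instances
// == 3 and an access_type_mask containing READ and WRITE is serviced; the two
// read instances are flagged as "filtered" and linked into the
// representative's merged_instances_list.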
731 static void fetch_fault_buffer_entries(uvm_gpu_t *gpu,
732                                        uvm_fault_service_batch_context_t *batch_context,
733                                        fault_fetch_mode_t fetch_mode)
734 {
735     NvU32 get;
736     NvU32 put;
737     NvU32 fault_index;
738     NvU32 num_coalesced_faults;
739     NvU32 utlb_id;
740     uvm_fault_buffer_entry_t *fault_cache;
741     uvm_spin_loop_t spin;
742     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
743     const bool in_pascal_cancel_path = (!gpu->parent->fault_cancel_va_supported && fetch_mode == FAULT_FETCH_MODE_ALL);
744     const bool may_filter = uvm_perf_fault_coalesce && !in_pascal_cancel_path;
745 
746     UVM_ASSERT(uvm_sem_is_locked(&gpu->parent->isr.replayable_faults.service_lock));
747     UVM_ASSERT(gpu->parent->replayable_faults_supported);
748 
749     fault_cache = batch_context->fault_cache;
750 
751     get = replayable_faults->cached_get;
752 
753     // Read put pointer from GPU and cache it
754     if (get == replayable_faults->cached_put)
755         replayable_faults->cached_put = gpu->parent->fault_buffer_hal->read_put(gpu->parent);
756 
757     put = replayable_faults->cached_put;
758 
759     batch_context->is_single_instance_ptr = true;
760     batch_context->last_fault = NULL;
761 
762     fault_index = 0;
763     num_coalesced_faults = 0;
764 
765     // Clear uTLB counters
766     for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
767         batch_context->utlbs[utlb_id].num_pending_faults = 0;
768         batch_context->utlbs[utlb_id].has_fatal_faults = false;
769     }
770     batch_context->max_utlb_id = 0;
771 
772     if (get == put)
773         goto done;
774 
    // Parse while get != put and we have enough space left to cache the entry.
776     while ((get != put) &&
777            (fetch_mode == FAULT_FETCH_MODE_ALL || fault_index < gpu->parent->fault_buffer_info.max_batch_size)) {
778         bool is_same_instance_ptr = true;
779         uvm_fault_buffer_entry_t *current_entry = &fault_cache[fault_index];
780         uvm_fault_utlb_info_t *current_tlb;
781 
782         // We cannot just wait for the last entry (the one pointed by put) to
783         // become valid, we have to do it individually since entries can be
784         // written out of order
785         UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
786             // We have some entry to work on. Let's do the rest later.
787             if (fetch_mode != FAULT_FETCH_MODE_ALL &&
788                 fetch_mode != FAULT_FETCH_MODE_BATCH_ALL &&
789                 fault_index > 0)
790                 goto done;
791         }
792 
        // Prevent later accesses from being moved above the read of the valid
        // bit
794         smp_mb__after_atomic();
795 
796         // Got valid bit set. Let's cache.
797         gpu->parent->fault_buffer_hal->parse_entry(gpu->parent, get, current_entry);
798 
799         // The GPU aligns the fault addresses to 4k, but all of our tracking is
800         // done in PAGE_SIZE chunks which might be larger.
801         current_entry->fault_address = UVM_PAGE_ALIGN_DOWN(current_entry->fault_address);
802 
803         // Make sure that all fields in the entry are properly initialized
804         current_entry->is_fatal = (current_entry->fault_type >= UVM_FAULT_TYPE_FATAL);
805 
806         if (current_entry->is_fatal) {
807             // Record the fatal fault event later as we need the va_space locked
808             current_entry->fatal_reason = UvmEventFatalReasonInvalidFaultType;
809         }
810         else {
811             current_entry->fatal_reason = UvmEventFatalReasonInvalid;
812         }
813 
814         current_entry->va_space = NULL;
815         current_entry->filtered = false;
816 
817         if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
818             UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
819             batch_context->max_utlb_id = current_entry->fault_source.utlb_id;
820         }
821 
822         current_tlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
823 
824         if (fault_index > 0) {
825             UVM_ASSERT(batch_context->last_fault);
826             is_same_instance_ptr = cmp_fault_instance_ptr(current_entry, batch_context->last_fault) == 0;
827 
828             // Coalesce duplicate faults when possible
829             if (may_filter && !current_entry->is_fatal) {
830                 bool merged = fetch_fault_buffer_try_merge_entry(current_entry,
831                                                                  batch_context,
832                                                                  current_tlb,
833                                                                  is_same_instance_ptr);
834                 if (merged)
835                     goto next_fault;
836             }
837         }
838 
839         if (batch_context->is_single_instance_ptr && !is_same_instance_ptr)
840             batch_context->is_single_instance_ptr = false;
841 
842         current_entry->num_instances = 1;
843         current_entry->access_type_mask = uvm_fault_access_type_mask_bit(current_entry->fault_access_type);
844         INIT_LIST_HEAD(&current_entry->merged_instances_list);
845 
846         ++current_tlb->num_pending_faults;
847         current_tlb->last_fault = current_entry;
848         batch_context->last_fault = current_entry;
849 
850         ++num_coalesced_faults;
851 
852     next_fault:
853         ++fault_index;
854         ++get;
855         if (get == replayable_faults->max_faults)
856             get = 0;
857     }
858 
859 done:
860     write_get(gpu->parent, get);
861 
862     batch_context->num_cached_faults = fault_index;
863     batch_context->num_coalesced_faults = num_coalesced_faults;
864 }
865 
866 // Sort comparator for pointers to fault buffer entries that sorts by
867 // instance pointer
868 static int cmp_sort_fault_entry_by_instance_ptr(const void *_a, const void *_b)
869 {
870     const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a;
871     const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b;
872 
873     return cmp_fault_instance_ptr(*a, *b);
874 }
875 
876 // Sort comparator for pointers to fault buffer entries that sorts by va_space,
877 // fault address and fault access type
878 static int cmp_sort_fault_entry_by_va_space_address_access_type(const void *_a, const void *_b)
879 {
880     const uvm_fault_buffer_entry_t **a = (const uvm_fault_buffer_entry_t **)_a;
881     const uvm_fault_buffer_entry_t **b = (const uvm_fault_buffer_entry_t **)_b;
882 
883     int result;
884 
885     result = cmp_va_space((*a)->va_space, (*b)->va_space);
886     if (result != 0)
887         return result;
888 
889     result = cmp_addr((*a)->fault_address, (*b)->fault_address);
890     if (result != 0)
891         return result;
892 
893     return cmp_access_type((*a)->fault_access_type, (*b)->fault_access_type);
894 }
895 
896 // Translate all instance pointers to VA spaces. Since the buffer is ordered by
897 // instance_ptr, we minimize the number of translations
898 //
899 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer
900 // flush occurred and executed successfully, or the error code if it failed.
901 // NV_OK otherwise.
902 static NV_STATUS translate_instance_ptrs(uvm_gpu_t *gpu,
903                                          uvm_fault_service_batch_context_t *batch_context)
904 {
905     NvU32 i;
906     NV_STATUS status;
907 
908     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
909         uvm_fault_buffer_entry_t *current_entry;
910 
911         current_entry = batch_context->ordered_fault_cache[i];
912 
913         // If this instance pointer matches the previous instance pointer, just
914         // copy over the already-translated va_space and move on.
915         if (i != 0 && cmp_fault_instance_ptr(current_entry, batch_context->ordered_fault_cache[i - 1]) == 0) {
916             current_entry->va_space = batch_context->ordered_fault_cache[i - 1]->va_space;
917             continue;
918         }
919 
920         status = uvm_gpu_fault_entry_to_va_space(gpu, current_entry, &current_entry->va_space);
921         if (status != NV_OK) {
922             if (status == NV_ERR_PAGE_TABLE_NOT_AVAIL) {
923                 // The channel is valid but the subcontext is not. This can only
924                 // happen if the subcontext is torn down before its work is
925                 // complete while other subcontexts in the same TSG are still
926                 // executing. This is a violation of the programming model. We
927                 // have limited options since the VA space is gone, meaning we
928                 // can't target the PDB for cancel even if we wanted to. So
929                 // we'll just throw away precise attribution and cancel this
930                 // fault using the SW method, which validates that the intended
931                 // context (TSG) is still running so we don't cancel an innocent
932                 // context.
933                 UVM_ASSERT(!current_entry->va_space);
934                 UVM_ASSERT(gpu->max_subcontexts > 0);
935 
936                 if (gpu->parent->smc.enabled) {
937                     status = push_cancel_on_gpu_targeted(gpu,
938                                                          current_entry->instance_ptr,
939                                                          current_entry->fault_source.gpc_id,
940                                                          current_entry->fault_source.client_id,
941                                                          &batch_context->tracker);
942                 }
943                 else {
944                     status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
945                 }
946 
947                 if (status != NV_OK)
948                     return status;
949 
950                 // Fall through and let the flush restart fault processing
951             }
952             else {
953                 UVM_ASSERT(status == NV_ERR_INVALID_CHANNEL);
954             }
955 
956             // If the channel is gone then we're looking at a stale fault entry.
957             // The fault must have been resolved already (serviced or
958             // cancelled), so we can just flush the fault buffer.
959             status = fault_buffer_flush_locked(gpu,
960                                                UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
961                                                UVM_FAULT_REPLAY_TYPE_START,
962                                                batch_context);
            if (status != NV_OK)
                return status;
965 
966             return NV_WARN_MORE_PROCESSING_REQUIRED;
967         }
968         else {
969             UVM_ASSERT(current_entry->va_space);
970         }
971     }
972 
973     return NV_OK;
974 }
975 
976 // Fault cache preprocessing for fault coalescing
977 //
// This function generates an ordered view of the given fault_cache in which
// faults are sorted by VA space, fault address (page-aligned at this point)
// and access type "intrusiveness". In order to minimize the number of
// instance_ptr to VA space translations we perform a first sort by
// instance_ptr.
982 //
983 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if a fault buffer
984 // flush occurred during instance_ptr translation and executed successfully, or
985 // the error code if it failed. NV_OK otherwise.
986 //
987 // Current scheme:
988 // 1) sort by instance_ptr
989 // 2) translate all instance_ptrs to VA spaces
990 // 3) sort by va_space, fault address (fault_address is page-aligned at this
991 //    point) and access type
992 static NV_STATUS preprocess_fault_batch(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
993 {
994     NV_STATUS status;
995     NvU32 i, j;
996     uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache;
997 
998     UVM_ASSERT(batch_context->num_coalesced_faults > 0);
999     UVM_ASSERT(batch_context->num_cached_faults >= batch_context->num_coalesced_faults);
1000 
1001     // Generate an ordered view of the fault cache in ordered_fault_cache.
1002     // We sort the pointers, not the entries in fault_cache
1003 
1004     // Initialize pointers before they are sorted. We only sort one instance per
1005     // coalesced fault
1006     for (i = 0, j = 0; i < batch_context->num_cached_faults; ++i) {
1007         if (!batch_context->fault_cache[i].filtered)
1008             ordered_fault_cache[j++] = &batch_context->fault_cache[i];
1009     }
1010     UVM_ASSERT(j == batch_context->num_coalesced_faults);
1011 
    // 1) If the fault batch contains faults from more than one instance_ptr,
    //    sort the entries by instance_ptr
1013     if (!batch_context->is_single_instance_ptr) {
1014         sort(ordered_fault_cache,
1015              batch_context->num_coalesced_faults,
1016              sizeof(*ordered_fault_cache),
1017              cmp_sort_fault_entry_by_instance_ptr,
1018              NULL);
1019     }
1020 
1021     // 2) translate all instance_ptrs to VA spaces
1022     status = translate_instance_ptrs(gpu, batch_context);
1023     if (status != NV_OK)
1024         return status;
1025 
    // 3) sort by va_space, fault address (fault_address is page-aligned at
    //    this point) and access type
1028     sort(ordered_fault_cache,
1029          batch_context->num_coalesced_faults,
1030          sizeof(*ordered_fault_cache),
1031          cmp_sort_fault_entry_by_va_space_address_access_type,
1032          NULL);
1033 
1034     return NV_OK;
1035 }
1036 
1037 static bool check_fault_entry_duplicate(const uvm_fault_buffer_entry_t *current_entry,
1038                                         const uvm_fault_buffer_entry_t *previous_entry)
1039 {
1040     bool is_duplicate = false;
1041 
1042     if (previous_entry) {
1043         is_duplicate = (current_entry->va_space == previous_entry->va_space) &&
1044                        (current_entry->fault_address == previous_entry->fault_address);
1045     }
1046 
1047     return is_duplicate;
1048 }
1049 
1050 static void fault_entry_duplicate_flags(uvm_fault_buffer_entry_t *current_entry,
1051                                         const uvm_fault_buffer_entry_t *previous_entry)
1052 {
1053     UVM_ASSERT(previous_entry);
1054     UVM_ASSERT(check_fault_entry_duplicate(current_entry, previous_entry));
1055 
1056     // Propagate the is_invalid_prefetch flag across all prefetch faults
1057     // on the page
1058     if (previous_entry->is_invalid_prefetch)
1059         current_entry->is_invalid_prefetch = true;
1060 
1061     // If a page is throttled, all faults on the page must be skipped
1062     if (previous_entry->is_throttled)
1063         current_entry->is_throttled = true;
1064 }
1065 
1066 static void update_batch_context(uvm_fault_service_batch_context_t *batch_context,
1067                                  uvm_fault_buffer_entry_t *current_entry,
1068                                  const uvm_fault_buffer_entry_t *previous_entry)
1069 {
1070     bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1071     uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
1072 
1073     UVM_ASSERT(utlb->num_pending_faults > 0);
1074 
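    // Each representative entry already coalesced (num_instances - 1) filtered
    // duplicates during fetch. If the entry also duplicates the previous
    // ordered entry (same va_space and fault address), all of its instances
    // count as duplicates.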
1075     if (is_duplicate)
1076         batch_context->num_duplicate_faults += current_entry->num_instances;
1077     else
1078         batch_context->num_duplicate_faults += current_entry->num_instances - 1;
1079 
1080     if (current_entry->is_invalid_prefetch)
1081         batch_context->num_invalid_prefetch_faults += current_entry->num_instances;
1082 
1083     if (current_entry->is_fatal) {
1084         utlb->has_fatal_faults = true;
1085         batch_context->has_fatal_faults = true;
1086     }
1087 
1088     if (current_entry->is_throttled)
1089         batch_context->has_throttled_faults = true;
1090 }
1091 
1092 // This function computes the maximum access type that can be serviced for the
1093 // reported fault instances given the logical permissions of the VA range. If
1094 // none of the fault instances can be serviced UVM_FAULT_ACCESS_TYPE_COUNT is
1095 // returned instead.
1096 //
1097 // In the case that there are faults that cannot be serviced, this function
1098 // also sets the flags required for fault cancellation. Prefetch faults do not
1099 // need to be cancelled since they disappear on replay.
1100 //
1101 // The UVM driver considers two scenarios for logical permissions violation:
1102 // - All access types are invalid. For example, when faulting from a processor
1103 // that doesn't have access to the preferred location of a range group when it
1104 // is not migratable. In this case all accesses to the page must be cancelled.
1105 // - Write/atomic accesses are invalid. Basically, when trying to modify a
1106 // read-only VA range. In this case we restrict fault cancelling to those types
1107 // of accesses.
1108 //
1109 // Return values:
1110 // - service_access_type: highest access type that can be serviced.
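//
// As an illustrative example: a coalesced fault entry with an access type mask
// containing READ and WRITE on a read-only VA range is marked fatal with
// cancel_va_mode == UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC, and
// UVM_FAULT_ACCESS_TYPE_READ is returned so that the pending read instances
// can still be serviced before the write faults are cancelled.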
1111 static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
1112                                                               uvm_va_block_t *va_block,
1113                                                               uvm_va_block_context_t *va_block_context,
1114                                                               uvm_fault_buffer_entry_t *fault_entry,
1115                                                               bool allow_migration)
1116 {
1117     NV_STATUS perm_status;
1118 
1119     perm_status = uvm_va_block_check_logical_permissions(va_block,
1120                                                          va_block_context,
1121                                                          gpu->id,
1122                                                          uvm_va_block_cpu_page_index(va_block,
1123                                                                                      fault_entry->fault_address),
1124                                                          fault_entry->fault_access_type,
1125                                                          allow_migration);
1126     if (perm_status == NV_OK)
1127         return fault_entry->fault_access_type;
1128 
1129     if (fault_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
1130         fault_entry->is_invalid_prefetch = true;
1131         return UVM_FAULT_ACCESS_TYPE_COUNT;
1132     }
1133 
1134     // At this point we know that some fault instances cannot be serviced
1135     fault_entry->is_fatal = true;
1136     fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
1137 
1138     if (fault_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) {
1139         fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;
1140 
1141         // If there are pending read accesses on the same page, we have to
1142         // service them before we can cancel the write/atomic faults. So we
1143         // retry with read fault access type.
1144         if (uvm_fault_access_type_mask_test(fault_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
1145             perm_status = uvm_va_block_check_logical_permissions(va_block,
1146                                                                  va_block_context,
1147                                                                  gpu->id,
1148                                                                  uvm_va_block_cpu_page_index(va_block,
1149                                                                                              fault_entry->fault_address),
1150                                                                  UVM_FAULT_ACCESS_TYPE_READ,
1151                                                                  allow_migration);
1152             if (perm_status == NV_OK)
1153                 return UVM_FAULT_ACCESS_TYPE_READ;
1154 
1155             // If that didn't succeed, cancel all faults
1156             fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1157             fault_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(perm_status);
1158         }
1159     }
1160     else {
1161         fault_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1162     }
1163 
1164     return UVM_FAULT_ACCESS_TYPE_COUNT;
1165 }
1166 
1167 // We notify the fault event for all faults within the block so that the
1168 // performance heuristics are updated. Then, all required actions for the block
1169 // data are performed by the performance heuristics code.
1170 //
1171 // Fatal faults are flagged as fatal for later cancellation. Servicing is not
1172 // interrupted on fatal faults due to insufficient permissions or invalid
1173 // addresses.
1174 //
1175 // Return codes:
1176 // - NV_OK if all faults were handled (both fatal and non-fatal)
1177 // - NV_ERR_MORE_PROCESSING_REQUIRED if servicing needs allocation retry
1178 // - NV_ERR_NO_MEMORY if the faults could not be serviced due to OOM
1179 // - Any other value is a UVM-global error
1180 static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
1181                                                   uvm_va_block_t *va_block,
1182                                                   uvm_va_block_retry_t *va_block_retry,
1183                                                   uvm_fault_service_batch_context_t *batch_context,
1184                                                   NvU32 first_fault_index,
1185                                                   NvU32 *block_faults)
1186 {
1187     NV_STATUS status = NV_OK;
1188     NvU32 i;
1189     uvm_page_index_t first_page_index;
1190     uvm_page_index_t last_page_index;
1191     NvU32 page_fault_count = 0;
1192     uvm_range_group_range_iter_t iter;
1193     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
1194     uvm_fault_buffer_entry_t **ordered_fault_cache = batch_context->ordered_fault_cache;
1195     uvm_service_block_context_t *block_context = &replayable_faults->block_service_context;
1196     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1197     NvU64 end;
1198 
1199     // Check that all uvm_fault_access_type_t values can fit into an NvU8
1200     BUILD_BUG_ON(UVM_FAULT_ACCESS_TYPE_COUNT > (int)(NvU8)-1);
1201 
1202     uvm_assert_mutex_locked(&va_block->lock);
1203 
1204     *block_faults = 0;
1205 
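    // Track the span of faulted pages to be serviced. Start with an inverted
    // (empty) range and widen it as faults are processed; the final span is
    // used below to build the service region.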
1206     first_page_index = PAGES_PER_UVM_VA_BLOCK;
1207     last_page_index = 0;
1208 
1209     // Initialize fault service block context
1210     uvm_processor_mask_zero(&block_context->resident_processors);
1211     block_context->thrashing_pin_count = 0;
1212     block_context->read_duplicate_count = 0;
1213 
1214     uvm_range_group_range_migratability_iter_first(va_space, va_block->start, va_block->end, &iter);
1215 
1216     // The first entry is guaranteed to fall within this block
1217     UVM_ASSERT(ordered_fault_cache[first_fault_index]->va_space == va_space);
1218     UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address >= va_block->start);
1219     UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address <= va_block->end);
1220 
1221     if (uvm_va_block_is_hmm(va_block)) {
1222         uvm_hmm_find_policy_end(va_block,
1223                                 &block_context->block_context,
1224                                 ordered_fault_cache[first_fault_index]->fault_address,
1225                                 &end);
1226     }
1227     else {
1228         block_context->block_context.policy = uvm_va_range_get_policy(va_block->va_range);
1229         end = va_block->end;
1230     }
1231 
1232     // Scan the sorted array and notify the fault event for all fault entries
1233     // in the block
1234     for (i = first_fault_index;
1235          i < batch_context->num_coalesced_faults &&
1236          ordered_fault_cache[i]->va_space == va_space &&
1237          ordered_fault_cache[i]->fault_address <= end;
1238          ++i) {
1239         uvm_fault_buffer_entry_t *current_entry = ordered_fault_cache[i];
1240         const uvm_fault_buffer_entry_t *previous_entry = NULL;
1241         bool read_duplicate;
1242         uvm_processor_id_t new_residency;
1243         uvm_perf_thrashing_hint_t thrashing_hint;
1244         uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, current_entry->fault_address);
1245         bool is_duplicate = false;
1246         uvm_fault_access_type_t service_access_type;
1247         NvU32 service_access_type_mask;
1248 
1249         UVM_ASSERT(current_entry->fault_access_type ==
1250                    uvm_fault_access_type_mask_highest(current_entry->access_type_mask));
1251 
1252         current_entry->is_fatal            = false;
1253         current_entry->is_throttled        = false;
1254         current_entry->is_invalid_prefetch = false;
1255 
1256         if (i > first_fault_index) {
1257             previous_entry = ordered_fault_cache[i - 1];
1258             is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1259         }
1260 
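        // Notify the fault event only on the first pass; allocation retries
        // re-run this loop for the same fault entries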
1261         if (block_context->num_retries == 0) {
1262             uvm_perf_event_notify_gpu_fault(&va_space->perf_events,
1263                                             va_block,
1264                                             gpu->id,
1265                                             block_context->block_context.policy->preferred_location,
1266                                             current_entry,
1267                                             batch_context->batch_id,
1268                                             is_duplicate);
1269         }
1270 
        // Only service the most intrusive fault per page; waive the rest
1272         if (is_duplicate) {
1273             fault_entry_duplicate_flags(current_entry, previous_entry);
1274 
            // The previous fault was non-fatal, so the page has already been
            // serviced
1277             if (!previous_entry->is_fatal)
1278                 goto next;
1279         }
1280 
1281         // Ensure that the migratability iterator covers the current fault
1282         // address
1283         while (iter.end < current_entry->fault_address)
1284             uvm_range_group_range_migratability_iter_next(va_space, &iter, va_block->end);
1285 
1286         UVM_ASSERT(iter.start <= current_entry->fault_address && iter.end >= current_entry->fault_address);
1287 
1288         service_access_type = check_fault_access_permissions(gpu,
1289                                                              va_block,
1290                                                              &block_context->block_context,
1291                                                              current_entry,
1292                                                              iter.migratable);
1293 
        // Do not exit early on logical errors such as access permission
        // violations.
1296         if (service_access_type == UVM_FAULT_ACCESS_TYPE_COUNT)
1297             goto next;
1298 
1299         if (service_access_type != current_entry->fault_access_type) {
1300             // Some of the fault instances cannot be serviced due to invalid
1301             // access permissions. Recompute the access type service mask to
1302             // service the rest.
1303             UVM_ASSERT(service_access_type < current_entry->fault_access_type);
1304             service_access_type_mask = uvm_fault_access_type_mask_bit(service_access_type);
1305         }
1306         else {
1307             service_access_type_mask = current_entry->access_type_mask;
1308         }
1309 
1310         // If the GPU already has the necessary access permission, the fault
1311         // does not need to be serviced
1312         if (uvm_va_block_page_is_gpu_authorized(va_block,
1313                                                 page_index,
1314                                                 gpu->id,
1315                                                 uvm_fault_access_type_to_prot(service_access_type)))
1316             goto next;
1317 
1318         thrashing_hint = uvm_perf_thrashing_get_hint(va_block, current_entry->fault_address, gpu->id);
1319         if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
1320             // Throttling is implemented by sleeping in the fault handler on
1321             // the CPU and by continuing to process faults on other pages on
1322             // the GPU
1323             current_entry->is_throttled = true;
1324             goto next;
1325         }
1326         else if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
1327             if (block_context->thrashing_pin_count++ == 0)
1328                 uvm_page_mask_zero(&block_context->thrashing_pin_mask);
1329 
1330             uvm_page_mask_set(&block_context->thrashing_pin_mask, page_index);
1331         }
1332 
1333         // Compute new residency and update the masks
1334         new_residency = uvm_va_block_select_residency(va_block,
1335                                                       &block_context->block_context,
1336                                                       page_index,
1337                                                       gpu->id,
1338                                                       service_access_type_mask,
1339                                                       block_context->block_context.policy,
1340                                                       &thrashing_hint,
1341                                                       UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
1342                                                       &read_duplicate);
1343 
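        // The first time a fault in this batch selects new_residency, clear
        // that processor's new_residency page mask before accumulating pages
        // into it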
1344         if (!uvm_processor_mask_test_and_set(&block_context->resident_processors, new_residency))
1345             uvm_page_mask_zero(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
1346 
1347         uvm_page_mask_set(&block_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
1348 
1349         if (read_duplicate) {
1350             if (block_context->read_duplicate_count++ == 0)
1351                 uvm_page_mask_zero(&block_context->read_duplicate_mask);
1352 
1353             uvm_page_mask_set(&block_context->read_duplicate_mask, page_index);
1354         }
1355 
1356         ++page_fault_count;
1357 
1358         block_context->access_type[page_index] = service_access_type;
1359 
1360         if (page_index < first_page_index)
1361             first_page_index = page_index;
1362         if (page_index > last_page_index)
1363             last_page_index = page_index;
1364 
1365     next:
1366         // Only update counters the first time since logical permissions cannot
1367         // change while we hold the VA space lock
1368         // TODO: Bug 1750144: That might not be true with HMM.
1369         if (block_context->num_retries == 0)
1370             update_batch_context(batch_context, current_entry, previous_entry);
1371     }
1372 
1373     // Apply the changes computed in the fault service block context, if there
1374     // are pages to be serviced
1375     if (page_fault_count > 0) {
1376         block_context->region = uvm_va_block_region(first_page_index, last_page_index + 1);
1377         status = uvm_va_block_service_locked(gpu->id, va_block, va_block_retry, block_context);
1378     }
1379 
1380     *block_faults = i - first_fault_index;
1381 
1382     ++block_context->num_retries;
1383 
1384     if (status == NV_OK && batch_context->has_fatal_faults)
1385         status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
1386 
1387     return status;
1388 }
1389 
1390 // We notify the fault event for all faults within the block so that the
// performance heuristics are updated. The VA block lock is held for the
// duration of fault servicing, although it may be temporarily dropped and
// re-taken if memory eviction is required.
1394 //
1395 // See the comments for function service_fault_batch_block_locked for
1396 // implementation details and error codes.
1397 static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu,
1398                                            uvm_va_block_t *va_block,
1399                                            uvm_fault_service_batch_context_t *batch_context,
1400                                            NvU32 first_fault_index,
1401                                            NvU32 *block_faults)
1402 {
1403     NV_STATUS status;
1404     uvm_va_block_retry_t va_block_retry;
1405     NV_STATUS tracker_status;
1406     uvm_service_block_context_t *fault_block_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
1407 
1408     fault_block_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
1409     fault_block_context->num_retries = 0;
1410 
1411     uvm_mutex_lock(&va_block->lock);
1412 
1413     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
1414                                        service_fault_batch_block_locked(gpu,
1415                                                                         va_block,
1416                                                                         &va_block_retry,
1417                                                                         batch_context,
1418                                                                         first_fault_index,
1419                                                                         block_faults));
1420 
1421     tracker_status = uvm_tracker_add_tracker_safe(&batch_context->tracker, &va_block->tracker);
1422 
1423     uvm_mutex_unlock(&va_block->lock);
1424 
1425     return status == NV_OK? tracker_status: status;
1426 }
1427 
1428 typedef enum
1429 {
1430     // Use this mode when calling from the normal fault servicing path
1431     FAULT_SERVICE_MODE_REGULAR,
1432 
1433     // Use this mode when servicing faults from the fault cancelling algorithm.
1434     // In this mode no replays are issued
1435     FAULT_SERVICE_MODE_CANCEL,
1436 } fault_service_mode_t;
1437 
1438 static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
1439                                          struct mm_struct *mm,
1440                                          uvm_fault_service_batch_context_t *batch_context,
1441                                          NvU32 first_fault_index,
1442                                          NvU32 *block_faults)
1443 {
1444     NV_STATUS status;
1445     uvm_gpu_t *gpu = gpu_va_space->gpu;
1446     uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
1447     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1448     const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
1449                                                        batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
1450     bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1451 
1452     if (is_duplicate)
1453         fault_entry_duplicate_flags(current_entry, previous_entry);
1454 
1455     // Generate fault events for all fault packets
1456     uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
1457                                     NULL,
1458                                     gpu->id,
1459                                     UVM_ID_INVALID,
1460                                     current_entry,
1461                                     batch_context->batch_id,
1462                                     is_duplicate);
1463 
    // The VA isn't managed. See if ATS knows about it, unless it is a
    // duplicate and the previous fault was non-fatal, in which case the page
    // has already been serviced
1467     //
1468     // TODO: Bug 2103669: Service more than one ATS fault at a time so we
1469     //       don't do an unconditional VA range lookup for every ATS fault.
1470     if (!is_duplicate || previous_entry->is_fatal)
1471         status = uvm_ats_service_fault_entry(gpu_va_space, current_entry, ats_invalidate);
1472     else
1473         status = NV_OK;
1474 
1475     (*block_faults)++;
1476 
1477     update_batch_context(batch_context, current_entry, previous_entry);
1478 
1479     return status;
1480 }
1481 
1482 static void service_fault_batch_fatal(uvm_gpu_t *gpu,
1483                                       uvm_fault_service_batch_context_t *batch_context,
1484                                       NvU32 first_fault_index,
1485                                       NV_STATUS status,
1486                                       NvU32 *block_faults)
1487 {
1488     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1489     const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
1490                                                        batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
1491     bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
1492 
1493     if (is_duplicate)
1494         fault_entry_duplicate_flags(current_entry, previous_entry);
1495 
    // The VA block could not be found. Set the fatal fault flag unless it is
    // a prefetch fault
1498     if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
1499         current_entry->is_invalid_prefetch = true;
1500     }
1501     else {
1502         current_entry->is_fatal = true;
1503         current_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);
1504         current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1505     }
1506 
1507     update_batch_context(batch_context, current_entry, previous_entry);
1508 
1509     uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
1510                                     NULL,
1511                                     gpu->id,
1512                                     UVM_ID_INVALID,
1513                                     current_entry,
1514                                     batch_context->batch_id,
1515                                     is_duplicate);
1516 
1517     (*block_faults)++;
1518 }
1519 
1520 static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
1521                                               uvm_gpu_va_space_t *gpu_va_space,
1522                                               uvm_fault_service_batch_context_t *batch_context,
1523                                               NvU32 first_fault_index,
1524                                               NvU32 *block_faults)
1525 {
1526     NV_STATUS status;
1527     uvm_va_range_t *va_range;
1528     uvm_va_block_t *va_block;
1529     uvm_gpu_t *gpu = gpu_va_space->gpu;
1530     uvm_va_block_context_t *va_block_context =
1531         &gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
1532     uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
1533     struct mm_struct *mm = va_block_context->mm;
1534     NvU64 fault_address = current_entry->fault_address;
1535 
1536     (*block_faults) = 0;
1537 
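    // Try to resolve the fault to a managed VA block first. If the address is
    // not managed, fall back to ATS when possible; otherwise the fault is
    // flagged below as fatal (or as an invalid prefetch).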
1538     va_range = uvm_va_range_find(va_space, fault_address);
1539     status = uvm_va_block_find_create_in_range(va_space, va_range, fault_address, va_block_context, &va_block);
1540     if (status == NV_OK) {
1541         status = service_fault_batch_block(gpu, va_block, batch_context, first_fault_index, block_faults);
1542     }
1543     else if ((status == NV_ERR_INVALID_ADDRESS) && uvm_ats_can_service_faults(gpu_va_space, mm)) {
1544         status = service_fault_batch_ats(gpu_va_space, mm, batch_context, first_fault_index, block_faults);
1545     }
1546     else {
1547         service_fault_batch_fatal(gpu_va_space->gpu, batch_context, first_fault_index, status, block_faults);
1548 
1549         // Do not fail due to logical errors
1550         status = NV_OK;
1551     }
1552 
1553     return status;
1554 }
1555 
// Scan the ordered view of faults, group the faults belonging to managed
// va_blocks, and service each va_block's faults in batch. Non-managed faults
// are serviced one at a time as they are encountered during the scan.
1560 //
1561 // This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
1562 // was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
1563 // space
1564 static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
1565                                      fault_service_mode_t service_mode,
1566                                      uvm_fault_service_batch_context_t *batch_context)
1567 {
1568     NV_STATUS status = NV_OK;
1569     NvU32 i;
1570     uvm_va_space_t *va_space = NULL;
1571     uvm_gpu_va_space_t *gpu_va_space = NULL;
1572     uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
1573     struct mm_struct *mm = NULL;
1574     const bool replay_per_va_block = service_mode != FAULT_SERVICE_MODE_CANCEL &&
1575                                      gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
1576     uvm_va_block_context_t *va_block_context =
1577         &gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
1578 
1579     UVM_ASSERT(gpu->parent->replayable_faults_supported);
1580 
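    // Start the batch with no ATS write faults recorded, so no ATS TLB
    // invalidations are pending yet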
1581     ats_invalidate->write_faults_in_batch = false;
1582 
1583     for (i = 0; i < batch_context->num_coalesced_faults;) {
1584         NvU32 block_faults;
1585         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1586         uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
1587 
1588         UVM_ASSERT(current_entry->va_space);
1589 
1590         if (current_entry->va_space != va_space) {
1591             // Fault on a different va_space, drop the lock of the old one...
1592             if (va_space != NULL) {
1593                 // TLB entries are invalidated per GPU VA space
1594                 status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
1595                 if (status != NV_OK)
1596                     goto fail;
1597 
1598                 uvm_va_space_up_read(va_space);
1599                 uvm_va_space_mm_release_unlock(va_space, mm);
1600                 mm = NULL;
1601             }
1602 
1603             va_space = current_entry->va_space;
1604 
1605             // ... and take the lock of the new one
1606 
            // If an mm is registered with the VA space, we have to retain it
            // in order to lock it before locking the VA space. It is guaranteed
            // to remain valid until we release it. If no mm is registered, we
            // can only service managed faults, not ATS/HMM faults.
1611             mm = uvm_va_space_mm_retain_lock(va_space);
1612             va_block_context->mm = mm;
1613 
1614             uvm_va_space_down_read(va_space);
1615 
1616             gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
1617             if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
1618                 status = fault_buffer_flush_locked(gpu,
1619                                                    UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
1620                                                    UVM_FAULT_REPLAY_TYPE_START,
1621                                                    batch_context);
1622                 if (status == NV_OK)
1623                     status = NV_WARN_MORE_PROCESSING_REQUIRED;
1624 
1625                 break;
1626             }
1627 
1628             // The case where there is no valid GPU VA space for the GPU in this
1629             // VA space is handled next
1630         }
1631 
        // Some faults could already be fatal if they cannot be handled by the
        // UVM driver
1634         if (current_entry->is_fatal) {
1635             ++i;
1636             batch_context->has_fatal_faults = true;
1637             utlb->has_fatal_faults = true;
1638             UVM_ASSERT(utlb->num_pending_faults > 0);
1639             continue;
1640         }
1641 
1642         if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
1643             // If there is no GPU VA space for the GPU, ignore the fault. This
1644             // can happen if a GPU VA space is destroyed without explicitly
1645             // freeing all memory ranges (destroying the VA range triggers a
1646             // flush of the fault buffer) and there are stale entries in the
1647             // buffer that got fixed by the servicing in a previous batch.
1648             ++i;
1649             continue;
1650         }
1651 
1652         status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults);
1653         // TODO: Bug 3900733: clean up locking in service_fault_batch().
1654         if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
1655             uvm_va_space_up_read(va_space);
1656             uvm_va_space_mm_release_unlock(va_space, mm);
1657             mm = NULL;
1658             va_space = NULL;
1659             continue;
1660         }
1661         if (status != NV_OK)
1662             goto fail;
1663 
1664         i += block_faults;
1665 
1666         // Don't issue replays in cancel mode
1667         if (replay_per_va_block && !batch_context->has_fatal_faults) {
1668             status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
1669             if (status != NV_OK)
1670                 goto fail;
1671 
1672             // Increment the batch id if UVM_PERF_FAULT_REPLAY_POLICY_BLOCK
1673             // is used, as we issue a replay after servicing each VA block
1674             // and we can service a number of VA blocks before returning.
1675             ++batch_context->batch_id;
1676         }
1677     }
1678 
1679     // Only clobber status if invalidate_status != NV_OK, since status may also
1680     // contain NV_WARN_MORE_PROCESSING_REQUIRED.
1681     if (va_space != NULL) {
1682         NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
1683         if (invalidate_status != NV_OK)
1684             status = invalidate_status;
1685     }
1686 
1687 fail:
1688     if (va_space != NULL) {
1689         uvm_va_space_up_read(va_space);
1690         uvm_va_space_mm_release_unlock(va_space, mm);
1691     }
1692 
1693     return status;
1694 }
1695 
1696 // Tells if the given fault entry is the first one in its uTLB
1697 static bool is_first_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context, NvU32 fault_index)
1698 {
1699     NvU32 i;
1700     NvU32 utlb_id = batch_context->fault_cache[fault_index].fault_source.utlb_id;
1701 
1702     for (i = 0; i < fault_index; ++i) {
1703         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1704 
1705         // We have found a prior fault in the same uTLB
1706         if (current_entry->fault_source.utlb_id == utlb_id)
1707             return false;
1708     }
1709 
1710     return true;
1711 }
1712 
1713 // Compute the number of fatal and non-fatal faults for a page in the given uTLB
1714 static void faults_for_page_in_utlb(uvm_fault_service_batch_context_t *batch_context,
1715                                     uvm_va_space_t *va_space,
1716                                     NvU64 addr,
1717                                     NvU32 utlb_id,
1718                                     NvU32 *fatal_faults,
1719                                     NvU32 *non_fatal_faults)
1720 {
1721     NvU32 i;
1722 
1723     *fatal_faults = 0;
1724     *non_fatal_faults = 0;
1725 
1726     // Fault filtering is not allowed in the TLB-based fault cancel path
1727     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1728 
1729     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1730         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1731 
1732         if (current_entry->fault_source.utlb_id == utlb_id &&
1733             current_entry->va_space == va_space && current_entry->fault_address == addr) {
1734             // We have found the page
1735             if (current_entry->is_fatal)
1736                 ++(*fatal_faults);
1737             else
1738                 ++(*non_fatal_faults);
1739         }
1740     }
1741 }
1742 
// Returns true if the given uTLB contains addresses (remember that fault
// addresses are 4K-aligned) with non-fatal faults only
1745 static bool no_fatal_pages_in_utlb(uvm_fault_service_batch_context_t *batch_context,
1746                                    NvU32 start_index,
1747                                    NvU32 utlb_id)
1748 {
1749     NvU32 i;
1750 
1751     // Fault filtering is not allowed in the TLB-based fault cancel path
1752     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1753 
1754     for (i = start_index; i < batch_context->num_cached_faults; ++i) {
1755         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1756 
1757         if (current_entry->fault_source.utlb_id == utlb_id) {
1758             // We have found a fault for the uTLB
1759             NvU32 fatal_faults;
1760             NvU32 non_fatal_faults;
1761 
1762             faults_for_page_in_utlb(batch_context,
1763                                     current_entry->va_space,
1764                                     current_entry->fault_address,
1765                                     utlb_id,
1766                                     &fatal_faults,
1767                                     &non_fatal_faults);
1768 
1769             if (non_fatal_faults > 0 && fatal_faults == 0)
1770                 return true;
1771         }
1772     }
1773 
1774     return false;
1775 }
1776 
1777 static void record_fatal_fault_helper(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *entry, UvmEventFatalReason reason)
1778 {
1779     uvm_va_space_t *va_space;
1780 
1781     va_space = entry->va_space;
1782     UVM_ASSERT(va_space);
1783     uvm_va_space_down_read(va_space);
1784     // Record fatal fault event
1785     uvm_tools_record_gpu_fatal_fault(gpu->parent->id, va_space, entry, reason);
1786     uvm_va_space_up_read(va_space);
1787 }
1788 
1789 // This function tries to find and issue a cancel for each uTLB that meets
1790 // the requirements to guarantee precise fault attribution:
1791 // - No new faults can arrive on the uTLB (uTLB is in lockdown)
1792 // - The first fault in the buffer for a specific uTLB is fatal
1793 // - There are no other addresses in the uTLB with non-fatal faults only
1794 //
1795 // This function and the related helpers iterate over faults as read from HW,
1796 // not through the ordered fault view
1797 //
1798 // TODO: Bug 1766754
1799 // This is very costly, although not critical for performance since we are
1800 // cancelling.
1801 // - Build a list with all the faults within a uTLB
1802 // - Sort by uTLB id
1803 static NV_STATUS try_to_cancel_utlbs(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
1804 {
1805     NvU32 i;
1806 
1807     // Fault filtering is not allowed in the TLB-based fault cancel path
1808     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1809 
1810     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1811         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1812         uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
1813         NvU32 gpc_id = current_entry->fault_source.gpc_id;
1814         NvU32 utlb_id = current_entry->fault_source.utlb_id;
1815         NvU32 client_id = current_entry->fault_source.client_id;
1816 
1817         // Only fatal faults are considered
1818         if (!current_entry->is_fatal)
1819             continue;
1820 
1821         // Only consider uTLBs in lock-down
1822         if (!utlb->in_lockdown)
1823             continue;
1824 
1825         // Issue a single cancel per uTLB
1826         if (utlb->cancelled)
1827             continue;
1828 
1829         if (is_first_fault_in_utlb(batch_context, i) &&
1830             !no_fatal_pages_in_utlb(batch_context, i + 1, utlb_id)) {
1831             NV_STATUS status;
1832 
1833             record_fatal_fault_helper(gpu, current_entry, current_entry->fatal_reason);
1834 
1835             status = push_cancel_on_gpu_targeted(gpu,
1836                                                  current_entry->instance_ptr,
1837                                                  gpc_id,
1838                                                  client_id,
1839                                                  &batch_context->tracker);
1840             if (status != NV_OK)
1841                 return status;
1842 
1843             utlb->cancelled = true;
1844         }
1845     }
1846 
1847     return NV_OK;
1848 }
1849 
1850 static NvU32 find_fatal_fault_in_utlb(uvm_fault_service_batch_context_t *batch_context,
1851                                       NvU32 utlb_id)
1852 {
1853     NvU32 i;
1854 
1855     // Fault filtering is not allowed in the TLB-based fault cancel path
1856     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1857 
1858     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1859         if (batch_context->fault_cache[i].is_fatal &&
1860             batch_context->fault_cache[i].fault_source.utlb_id == utlb_id)
1861             return i;
1862     }
1863 
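    // No fatal fault was found for this uTLB: i equals num_cached_faults,
    // which callers treat as an out-of-range index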
1864     return i;
1865 }
1866 
static bool is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_context,
                                     uvm_fault_buffer_entry_t *fault)
1869 {
1870     NvU32 i;
1871 
1872     // Fault filtering is not allowed in the TLB-based fault cancel path
1873     UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
1874 
1875     for (i = 0; i < batch_context->num_cached_faults; ++i) {
1876         uvm_fault_buffer_entry_t *current_entry = &batch_context->fault_cache[i];
1877         if (cmp_fault_instance_ptr(current_entry, fault) == 0 &&
1878             current_entry->fault_address == fault->fault_address &&
1879             current_entry->fault_access_type == fault->fault_access_type &&
1880             current_entry->fault_source.utlb_id == fault->fault_source.utlb_id) {
1881             return true;
1882         }
1883     }
1884 
1885     return false;
1886 }
1887 
1888 typedef enum
1889 {
1890     // Only cancel faults flagged as fatal
1891     FAULT_CANCEL_MODE_FATAL,
1892 
1893     // Cancel all faults in the batch unconditionally
1894     FAULT_CANCEL_MODE_ALL,
1895 } fault_cancel_mode_t;
1896 
1897 // Cancel faults in the given fault service batch context. The function provides
1898 // two different modes depending on the value of cancel_mode:
1899 // - If cancel_mode == FAULT_CANCEL_MODE_FATAL, only faults flagged as fatal
1900 // will be cancelled. In this case, the reason reported to tools is the one
1901 // contained in the fault entry itself.
1902 // - If cancel_mode == FAULT_CANCEL_MODE_ALL, all faults will be cancelled
1903 // unconditionally. In this case, the reason reported to tools for non-fatal
1904 // faults is the one passed to this function.
1905 static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
1906                                           uvm_fault_service_batch_context_t *batch_context,
1907                                           fault_cancel_mode_t cancel_mode,
1908                                           UvmEventFatalReason reason)
1909 {
1910     NV_STATUS status = NV_OK;
1911     NV_STATUS fault_status;
1912     uvm_va_space_t *va_space = NULL;
1913     NvU32 i;
1914 
1915     UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
1916     if (cancel_mode == FAULT_CANCEL_MODE_ALL)
1917         UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
1918 
1919     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
1920         uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
1921 
1922         UVM_ASSERT(current_entry->va_space);
1923 
1924         if (current_entry->va_space != va_space) {
1925             // Fault on a different va_space, drop the lock of the old one...
1926             if (va_space != NULL)
1927                 uvm_va_space_up_read(va_space);
1928 
1929             va_space = current_entry->va_space;
1930 
1931             // ... and take the lock of the new one
1932             uvm_va_space_down_read(va_space);
1933 
1934             // We don't need to check whether a buffer flush is required
1935             // (due to VA range destruction).
1936             // - For cancel_mode == FAULT_CANCEL_MODE_FATAL, once a fault is
1937             // flagged as fatal we need to cancel it, even if its VA range no
1938             // longer exists.
1939             // - For cancel_mode == FAULT_CANCEL_MODE_ALL we don't care about
1940             // any of this, we just want to trigger RC in RM.
1941         }
1942 
1943         if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
1944             // If there is no GPU VA space for the GPU, ignore the fault.
1945             // This can happen if the GPU VA did not exist in
1946             // service_fault_batch(), or it was destroyed since then.
            // This is to avoid targeting a PDB that might have been reused
1948             // by another process.
1949             continue;
1950         }
1951 
1952         // Cancel the fault
1953         if (cancel_mode == FAULT_CANCEL_MODE_ALL || current_entry->is_fatal) {
1954             uvm_fault_cancel_va_mode_t cancel_va_mode = current_entry->replayable.cancel_va_mode;
1955 
1956             // If cancelling unconditionally and the fault was not fatal,
1957             // set the cancel reason passed to this function
1958             if (!current_entry->is_fatal) {
1959                 current_entry->fatal_reason = reason;
1960                 cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
1961             }
1962 
1963             status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
1964             if (status != NV_OK)
1965                 break;
1966         }
1967     }
1968 
1969     if (va_space != NULL)
1970         uvm_va_space_up_read(va_space);
1971 
    // After cancelling the fatal faults, flush the fault buffer to discard any
    // duplicated faults that may have been added while processing the faults in
    // this batch. The flush also avoids unnecessary processing after the fatal
    // faults have been cancelled: the remaining faults are unlikely to show up
    // again after a replay because the context is probably in the process of
    // dying.
1978     fault_status = fault_buffer_flush_locked(gpu,
1979                                              UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
1980                                              UVM_FAULT_REPLAY_TYPE_START,
1981                                              batch_context);
1982 
1983     // We report the first encountered error.
1984     if (status == NV_OK)
1985         status = fault_status;
1986 
1987     return status;
1988 }
1989 
1990 // Function called when the system has found a global error and needs to
1991 // trigger RC in RM.
1992 static void cancel_fault_batch_tlb(uvm_gpu_t *gpu,
1993                                    uvm_fault_service_batch_context_t *batch_context,
1994                                    UvmEventFatalReason reason)
1995 {
1996     NvU32 i;
1997 
1998     for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
1999         NV_STATUS status = NV_OK;
2000         uvm_fault_buffer_entry_t *current_entry;
2001         uvm_fault_buffer_entry_t *coalesced_entry;
2002 
2003         current_entry = batch_context->ordered_fault_cache[i];
2004 
2005         // The list iteration below skips the entry used as 'head'.
2006         // Report the 'head' entry explicitly.
2007         uvm_va_space_down_read(current_entry->va_space);
2008         uvm_tools_record_gpu_fatal_fault(gpu->parent->id, current_entry->va_space, current_entry, reason);
2009 
2010         list_for_each_entry(coalesced_entry, &current_entry->merged_instances_list, merged_instances_list)
2011             uvm_tools_record_gpu_fatal_fault(gpu->parent->id, current_entry->va_space, coalesced_entry, reason);
2012         uvm_va_space_up_read(current_entry->va_space);
2013 
        // We need to cancel each instance pointer to correctly handle faults
        // from multiple contexts.
2015         status = push_cancel_on_gpu_global(gpu, current_entry->instance_ptr, &batch_context->tracker);
2016         if (status != NV_OK)
2017             break;
2018     }
2019 }
2020 
2021 static void cancel_fault_batch(uvm_gpu_t *gpu,
2022                                uvm_fault_service_batch_context_t *batch_context,
2023                                UvmEventFatalReason reason)
2024 {
2025     if (gpu->parent->fault_cancel_va_supported) {
2026         cancel_faults_precise_va(gpu, batch_context, FAULT_CANCEL_MODE_ALL, reason);
2027         return;
2028     }
2029 
2030     cancel_fault_batch_tlb(gpu, batch_context, reason);
2031 }
2032 
2033 
2034 // Current fault cancel algorithm
2035 //
// 1- Disable prefetching to keep new prefetch requests from coming in and
// flooding the buffer.
2038 // LOOP
2039 //   2- Record one fatal fault per uTLB to check if it shows up after the replay
2040 //   3- Flush fault buffer (REPLAY_TYPE_START_ACK_ALL to prevent new faults from
2041 //      coming to TLBs with pending faults)
2042 //   4- Wait for replay to finish
2043 //   5- Fetch all faults from buffer
2044 //   6- Check what uTLBs are in lockdown mode and can be cancelled
2045 //   7- Preprocess faults (order per va_space, fault address, access type)
2046 //   8- Service all non-fatal faults and mark all non-serviceable faults as fatal
//      8.1- If no fatal faults are found, we are done
2048 //   9- Search for a uTLB which can be targeted for cancel, as described in
2049 //      try_to_cancel_utlbs. If found, cancel it.
2050 // END LOOP
2051 // 10- Re-enable prefetching
2052 //
2053 // NOTE: prefetch faults MUST NOT trigger fault cancel. We make sure that no
2054 // prefetch faults are left in the buffer by disabling prefetching and
2055 // flushing the fault buffer afterwards (prefetch faults are not replayed and,
2056 // therefore, will not show up again)
2057 static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2058 {
2059     NV_STATUS status;
2060     NV_STATUS tracker_status;
2061     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
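    // 'first' marks the initial loop iteration, in which no prior fatal
    // faults have been recorded yet for the lockdown check in step 2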
2062     bool first = true;
2063 
2064     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2065 
    // 1) Disable prefetching to keep new prefetch requests from coming in
    //    and flooding the buffer
2068     if (gpu->parent->fault_buffer_info.prefetch_faults_enabled)
2069         gpu->parent->arch_hal->disable_prefetch_faults(gpu->parent);
2070 
2071     while (1) {
2072         NvU32 utlb_id;
2073 
2074         // 2) Record one fatal fault per uTLB to check if it shows up after
2075         // the replay. This is used to handle the case in which the uTLB is
2076         // being cancelled from behind our backs by RM. See the comment in
2077         // step 6.
2078         for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
2079             uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id];
2080 
2081             if (!first && utlb->has_fatal_faults) {
2082                 NvU32 idx = find_fatal_fault_in_utlb(batch_context, utlb_id);
2083                 UVM_ASSERT(idx < batch_context->num_cached_faults);
2084 
2085                 utlb->prev_fatal_fault = batch_context->fault_cache[idx];
2086             }
2087             else {
2088                 utlb->prev_fatal_fault.fault_address = (NvU64)-1;
2089             }
2090         }
2091         first = false;
2092 
        // 3) Flush fault buffer. After this call, all faults from any of the
        // faulting uTLBs are before PUT. New faults from other uTLBs can keep
        // arriving. Therefore, in each iteration we only try to cancel faults
        // from uTLBs that contained fatal faults in previous iterations, since
        // those uTLBs will stop generating new page faults after the following
        // replay with type UVM_FAULT_REPLAY_TYPE_START_ACK_ALL
2099         status = fault_buffer_flush_locked(gpu,
2100                                            UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
2101                                            UVM_FAULT_REPLAY_TYPE_START_ACK_ALL,
2102                                            batch_context);
2103         if (status != NV_OK)
2104             break;
2105 
2106         // 4) Wait for replay to finish
2107         status = uvm_tracker_wait(&replayable_faults->replay_tracker);
2108         if (status != NV_OK)
2109             break;
2110 
2111         batch_context->num_invalid_prefetch_faults = 0;
2112         batch_context->num_replays                 = 0;
2113         batch_context->has_fatal_faults            = false;
2114         batch_context->has_throttled_faults        = false;
2115 
2116         // 5) Fetch all faults from buffer
2117         fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
2118         ++batch_context->batch_id;
2119 
2120         UVM_ASSERT(batch_context->num_cached_faults == batch_context->num_coalesced_faults);
2121 
2122         // No more faults left, we are done
2123         if (batch_context->num_cached_faults == 0)
2124             break;
2125 
2126         // 6) Check what uTLBs are in lockdown mode and can be cancelled
2127         for (utlb_id = 0; utlb_id <= batch_context->max_utlb_id; ++utlb_id) {
2128             uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[utlb_id];
2129 
2130             utlb->in_lockdown = false;
2131             utlb->cancelled   = false;
2132 
2133             if (utlb->prev_fatal_fault.fault_address != (NvU64)-1) {
2134                 // If a previously-reported fault shows up again we can "safely"
2135                 // assume that the uTLB that contains it is in lockdown mode
2136                 // and no new translations will show up before cancel.
2137                 // A fatal fault could only be removed behind our backs by RM
2138                 // issuing a cancel, which only happens when RM is resetting the
2139                 // engine. That means the instance pointer can't generate any
2140                 // new faults, so we won't have an ABA problem where a new
2141                 // fault arrives with the same state.
2142                 if (is_fatal_fault_in_buffer(batch_context, &utlb->prev_fatal_fault))
2143                     utlb->in_lockdown = true;
2144             }
2145         }
2146 
2147         // 7) Preprocess faults
2148         status = preprocess_fault_batch(gpu, batch_context);
2149         if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2150             continue;
2151         else if (status != NV_OK)
2152             break;
2153 
2154         // 8) Service all non-fatal faults and mark all non-serviceable faults
2155         // as fatal
2156         status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
2157         if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2158             continue;
2159 
2160         UVM_ASSERT(batch_context->num_replays == 0);
2161         if (status == NV_ERR_NO_MEMORY)
2162             continue;
2163         else if (status != NV_OK)
2164             break;
2165 
2166         // No more fatal faults left, we are done
2167         if (!batch_context->has_fatal_faults)
2168             break;
2169 
2170         // 9) Search for uTLBs that contain fatal faults and meet the
2171         // requirements to be cancelled
2172         try_to_cancel_utlbs(gpu, batch_context);
2173     }
2174 
2175     // 10) Re-enable prefetching
2176     if (gpu->parent->fault_buffer_info.prefetch_faults_enabled)
2177         gpu->parent->arch_hal->enable_prefetch_faults(gpu->parent);
2178 
2179     if (status == NV_OK)
2180         status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2181 
2182     tracker_status = uvm_tracker_wait(&batch_context->tracker);
2183 
2184     return status == NV_OK? tracker_status: status;
2185 }
2186 
2187 static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
2188 {
2189     UVM_ASSERT(batch_context->has_fatal_faults);
2190     if (gpu->parent->fault_cancel_va_supported) {
2191         return cancel_faults_precise_va(gpu,
2192                                         batch_context,
2193                                         FAULT_CANCEL_MODE_FATAL,
2194                                         UvmEventFatalReasonInvalid);
2195     }
2196 
2197     return cancel_faults_precise_tlb(gpu, batch_context);
2198 }
2199 
2200 static void enable_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu, uvm_fault_service_batch_context_t *batch_context)
2201 {
2202     if (!parent_gpu->prefetch_fault_supported)
2203         return;
2204 
    // If invalid prefetch accesses account for more than two thirds of the
    // maximum fault batch size, disable prefetch faults for a while.
    // Some tests rely on this logic (and ratio) to correctly disable prefetch
    // fault reporting. If the logic changes, the tests will have to be changed.
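    // For example, assuming the default maximum batch size of 256 entries,
    // prefetch faults would be disabled once a batch reports more than 170
    // invalid prefetch accesses (256 * 2 / 3).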
2209     if (parent_gpu->fault_buffer_info.prefetch_faults_enabled &&
2210         uvm_perf_reenable_prefetch_faults_lapse_msec > 0 &&
2211         ((batch_context->num_invalid_prefetch_faults * 3 > parent_gpu->fault_buffer_info.max_batch_size * 2) ||
2212          (uvm_enable_builtin_tests &&
2213           parent_gpu->rm_info.isSimulated &&
2214           batch_context->num_invalid_prefetch_faults > 5))) {
2215         uvm_gpu_disable_prefetch_faults(parent_gpu);
2216     }
2217     else if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
2218         NvU64 lapse = NV_GETTIME() - parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp;
2219 
2220         // Reenable prefetch faults after some time
2221         if (lapse > ((NvU64)uvm_perf_reenable_prefetch_faults_lapse_msec * (1000 * 1000)))
2222             uvm_gpu_enable_prefetch_faults(parent_gpu);
2223     }
2224 }
2225 
2226 void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
2227 {
2228     NvU32 num_replays = 0;
2229     NvU32 num_batches = 0;
2230     NvU32 num_throttled = 0;
2231     NV_STATUS status = NV_OK;
2232     uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
2233     uvm_fault_service_batch_context_t *batch_context = &replayable_faults->batch_service_context;
2234 
2235     UVM_ASSERT(gpu->parent->replayable_faults_supported);
2236 
2237     uvm_tracker_init(&batch_context->tracker);
2238 
2239     // Process all faults in the buffer
2240     while (1) {
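        // Bound the amount of fault servicing work done in a single call to
        // this function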
2241         if (num_throttled >= uvm_perf_fault_max_throttle_per_service ||
2242             num_batches >= uvm_perf_fault_max_batches_per_service) {
2243             break;
2244         }
2245 
2246         batch_context->num_invalid_prefetch_faults = 0;
2247         batch_context->num_duplicate_faults        = 0;
2248         batch_context->num_replays                 = 0;
2249         batch_context->has_fatal_faults            = false;
2250         batch_context->has_throttled_faults        = false;
2251 
2252         fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
2253         if (batch_context->num_cached_faults == 0)
2254             break;
2255 
2256         ++batch_context->batch_id;
2257 
2258         status = preprocess_fault_batch(gpu, batch_context);
2259 
2260         num_replays += batch_context->num_replays;
2261 
2262         if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2263             continue;
2264         else if (status != NV_OK)
2265             break;
2266 
2267         status = service_fault_batch(gpu, FAULT_SERVICE_MODE_REGULAR, batch_context);
2268 
2269         // We may have issued replays even if status != NV_OK if
2270         // UVM_PERF_FAULT_REPLAY_POLICY_BLOCK is being used or the fault buffer
2271         // was flushed
2272         num_replays += batch_context->num_replays;
2273 
2274         if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2275             continue;
2276 
2277         enable_disable_prefetch_faults(gpu->parent, batch_context);
2278 
2279         if (status != NV_OK) {
2280             // Unconditionally cancel all faults to trigger RC. This will not
2281             // provide precise attribution, but this case handles global
2282             // errors such as OOM or ECC where it's not reasonable to
2283             // guarantee precise attribution. We ignore the return value of
2284             // the cancel operation since this path is already returning an
2285             // error code.
2286             cancel_fault_batch(gpu, batch_context, uvm_tools_status_to_fatal_fault_reason(status));
2287             break;
2288         }
2289 
2290         if (batch_context->has_fatal_faults) {
2291             status = uvm_tracker_wait(&batch_context->tracker);
2292             if (status == NV_OK)
2293                 status = cancel_faults_precise(gpu, batch_context);
2294 
2295             break;
2296         }
2297 
2298         if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH) {
2299             status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2300             if (status != NV_OK)
2301                 break;
2302             ++num_replays;
2303         }
2304         else if (replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH) {
2305             uvm_gpu_buffer_flush_mode_t flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_CACHED_PUT;
2306 
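            // Update PUT as part of the flush when the duplicates in this
            // batch exceed replay_update_put_ratio percent of the cached
            // faults. With the default ratio of 50, for example, this triggers
            // once more than half of the batch are duplicates.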
2307             if (batch_context->num_duplicate_faults * 100 >
2308                 batch_context->num_cached_faults * replayable_faults->replay_update_put_ratio) {
2309                 flush_mode = UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT;
2310             }
2311 
2312             status = fault_buffer_flush_locked(gpu, flush_mode, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2313             if (status != NV_OK)
2314                 break;
2315             ++num_replays;
2316             status = uvm_tracker_wait(&replayable_faults->replay_tracker);
2317             if (status != NV_OK)
2318                 break;
2319         }
2320 
2321         if (batch_context->has_throttled_faults)
2322             ++num_throttled;
2323 
2324         ++num_batches;
2325     }
2326 
2327     if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
2328         status = NV_OK;
2329 
2330     // Make sure that we issue at least one replay if no replay has been
2331     // issued yet to avoid dropping faults that do not show up in the buffer
2332     if ((status == NV_OK && replayable_faults->replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_ONCE) ||
2333         num_replays == 0)
2334         status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
2335 
2336     uvm_tracker_deinit(&batch_context->tracker);
2337 
2338     if (status != NV_OK)
2339         UVM_DBG_PRINT("Error servicing replayable faults on GPU: %s\n", uvm_gpu_name(gpu));
2340 }
2341 
2342 void uvm_gpu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
2343 {
2344     UVM_ASSERT(parent_gpu->isr.replayable_faults.handling);
2345     UVM_ASSERT(parent_gpu->prefetch_fault_supported);
2346 
2347     if (!parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
2348         parent_gpu->arch_hal->enable_prefetch_faults(parent_gpu);
2349         parent_gpu->fault_buffer_info.prefetch_faults_enabled = true;
2350     }
2351 }
2352 
2353 void uvm_gpu_disable_prefetch_faults(uvm_parent_gpu_t *parent_gpu)
2354 {
2355     UVM_ASSERT(parent_gpu->isr.replayable_faults.handling);
2356     UVM_ASSERT(parent_gpu->prefetch_fault_supported);
2357 
2358     if (parent_gpu->fault_buffer_info.prefetch_faults_enabled) {
2359         parent_gpu->arch_hal->disable_prefetch_faults(parent_gpu);
2360         parent_gpu->fault_buffer_info.prefetch_faults_enabled = false;
2361         parent_gpu->fault_buffer_info.disable_prefetch_faults_timestamp = NV_GETTIME();
2362     }
2363 }
2364 
2365 const char *uvm_perf_fault_replay_policy_string(uvm_perf_fault_replay_policy_t replay_policy)
2366 {
2367     BUILD_BUG_ON(UVM_PERF_FAULT_REPLAY_POLICY_MAX != 4);
2368 
2369     switch (replay_policy) {
2370         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BLOCK);
2371         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH);
2372         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_BATCH_FLUSH);
2373         UVM_ENUM_STRING_CASE(UVM_PERF_FAULT_REPLAY_POLICY_ONCE);
2374         UVM_ENUM_STRING_DEFAULT();
2375     }
2376 }
2377 
2378 NV_STATUS uvm_test_get_prefetch_faults_reenable_lapse(UVM_TEST_GET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params,
2379                                                       struct file *filp)
2380 {
2381     params->reenable_lapse = uvm_perf_reenable_prefetch_faults_lapse_msec;
2382 
2383     return NV_OK;
2384 }
2385 
2386 NV_STATUS uvm_test_set_prefetch_faults_reenable_lapse(UVM_TEST_SET_PREFETCH_FAULTS_REENABLE_LAPSE_PARAMS *params,
2387                                                       struct file *filp)
2388 {
2389     uvm_perf_reenable_prefetch_faults_lapse_msec = params->reenable_lapse;
2390 
2391     return NV_OK;
2392 }
2393 
2394 NV_STATUS uvm_test_drain_replayable_faults(UVM_TEST_DRAIN_REPLAYABLE_FAULTS_PARAMS *params, struct file *filp)
2395 {
2396     uvm_gpu_t *gpu;
2397     NV_STATUS status = NV_OK;
2398     uvm_spin_loop_t spin;
2399     bool pending = true;
2400     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2401 
2402     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, &params->gpu_uuid);
2403     if (!gpu)
2404         return NV_ERR_INVALID_DEVICE;
2405 
2406     uvm_spin_loop_init(&spin);
2407 
2408     do {
2409         uvm_gpu_replayable_faults_isr_lock(gpu->parent);
2410         pending = uvm_gpu_replayable_faults_pending(gpu->parent);
2411         uvm_gpu_replayable_faults_isr_unlock(gpu->parent);
2412 
2413         if (!pending)
2414             break;
2415 
2416         if (fatal_signal_pending(current)) {
2417             status = NV_ERR_SIGNAL_PENDING;
2418             break;
2419         }
2420 
2421         UVM_SPIN_LOOP(&spin);
2422     } while (uvm_spin_loop_elapsed(&spin) < params->timeout_ns);
2423 
2424     if (pending && status == NV_OK)
2425         status = NV_ERR_TIMEOUT;
2426 
2427     uvm_gpu_release(gpu);
2428 
2429     return status;
2430 }
2431