1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_api.h"
25 #include "uvm_conf_computing.h"
26 #include "uvm_perf_events.h"
27 #include "uvm_perf_module.h"
28 #include "uvm_perf_thrashing.h"
29 #include "uvm_perf_utils.h"
30 #include "uvm_va_block.h"
31 #include "uvm_va_range.h"
32 #include "uvm_kvmalloc.h"
33 #include "uvm_tools.h"
34 #include "uvm_procfs.h"
35 #include "uvm_test.h"
36 
// Number of bits for page-granularity time stamps. Currently we ignore the lowest
// 6 bits of the timestamp (i.e. we have 64ns resolution, which is good enough)
39 #define PAGE_THRASHING_LAST_TIME_STAMP_BITS 58
40 #define PAGE_THRASHING_NUM_EVENTS_BITS      3
41 
42 #define PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS 58
43 #define PAGE_THRASHING_THROTTLING_COUNT_BITS          8
44 
45 // Per-page thrashing detection structure.
46 typedef struct
47 {
48     struct
49     {
50         // Last time stamp when a thrashing-related event was recorded
51         NvU64                        last_time_stamp : PAGE_THRASHING_LAST_TIME_STAMP_BITS;
52 
53         bool                    has_migration_events : 1;
54 
55         bool                   has_revocation_events : 1;
56 
57         // Number of consecutive "thrashing" events (within the configured
58         // thrashing lapse)
59         NvU8                    num_thrashing_events : PAGE_THRASHING_NUM_EVENTS_BITS;
60 
61         bool                                  pinned : 1;
62     };
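
    // Note: the bit widths above (58 + 1 + 1 + 3 + 1) add up to 64, so these
    // fields are sized to fit in a single 64-bit word. The exact packing is
    // compiler-dependent since the bit-fields mix base types.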
63 
64     struct
65     {
66         // Deadline for throttled processors to wake up
67         NvU64              throttling_end_time_stamp : PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS;
68 
69         // Number of times a processor has been throttled. This is used to
70         // determine when the page needs to get pinned. After getting pinned
71         // this field is always 0.
72         NvU8                        throttling_count : PAGE_THRASHING_THROTTLING_COUNT_BITS;
73     };
74 
75     // Processors accessing this page
76     uvm_processor_mask_t                  processors;
77 
78     // Processors that have been throttled. This must be a subset of processors
79     uvm_processor_mask_t        throttled_processors;
80 
81     // Memory residency for the page when in pinning phase
82     uvm_processor_id_t           pinned_residency_id;
83 
84     // Processor not to be throttled in the current throttling period
85     uvm_processor_id_t  do_not_throttle_processor_id;
86 } page_thrashing_info_t;
87 
88 // Per-VA block thrashing detection structure. This state is protected by the
89 // VA block lock.
90 typedef struct
91 {
    // Per-page thrashing state. May be NULL if per-page tracking has not been
    // allocated for this block yet.
    page_thrashing_info_t                     *pages;

    // Number of pages in the block currently detected as thrashing
    NvU16                        num_thrashing_pages;

    // Number of times the thrashing state of the block has been reset
    NvU8                       thrashing_reset_count;

    // Last processor that recorded a thrashing-related event on the block
    uvm_processor_id_t                last_processor;

    // Time stamp of the last thrashing-related event recorded on the block
    NvU64                            last_time_stamp;

    // Time stamp of the last time thrashing was detected on the block
    NvU64                  last_thrashing_time_stamp;

    // Stats
    NvU32                           throttling_count;

    // Mask of pages in the block currently detected as thrashing
    uvm_page_mask_t                  thrashing_pages;

    struct
    {
        // Number of pages that have been pinned due to thrashing
        NvU32                                  count;

        // Mask of pinned pages
        uvm_page_mask_t                         mask;

        // List of pinned pages. This list is only used if the pinning timeout
        // is not 0.
        struct list_head                        list;
    } pinned_pages;
119 } block_thrashing_info_t;
120 
121 // Descriptor for a page that has been pinned due to thrashing. This structure
122 // is only used if the pinning timeout is not 0.
123 typedef struct
124 {
125     uvm_va_block_t                         *va_block;
126 
127     // Page index within va_block
128     uvm_page_index_t                      page_index;
129 
130     // Absolute timestamp after which the page will be unpinned
131     NvU64                                   deadline;
132 
133     // Entry in the per-VA Space list of pinned pages. See
134     // va_space_thrashing_info_t::pinned_pages::list.
135     struct list_head             va_space_list_entry;
136 
137     // Entry in the per-VA Block list of pinned pages. See
138     // block_thrashing_info_t::pinned_pages::list.
139     struct list_head             va_block_list_entry;
140 } pinned_page_t;
141 
142 // Per-VA space data structures and policy configuration
143 typedef struct
144 {
145     // Per-VA space accounting of pinned pages that is used to speculatively
146     // unpin pages after the configured timeout. This struct is only used if
147     // the pinning timeout is not 0.
148     struct
149     {
150         // Work descriptor that is executed asynchronously by a helper thread
151         struct delayed_work                    dwork;
152 
153         // List of pinned pages. They are (mostly) ordered by unpin deadline.
154         // New entries are inserted blindly at the tail since the expectation
155         // is that they will have the largest deadline value. However, given
156         // the drift between when multiple threads query their timestamps and
157         // add those pages to the list under the lock, it might not be
158         // strictly ordered. But this is OK since the difference will be very
159         // small and they will be eventually removed from the list.
160         //
161         // Entries are removed when they reach the deadline by the function
162         // configured in dwork. This list is protected by lock.
163         struct list_head                        list;
164 
165         uvm_spinlock_t                          lock;
166 
167         uvm_va_block_context_t      *va_block_context;
168 
169         // Flag used to avoid scheduling delayed unpinning operations after
170         // uvm_perf_thrashing_stop has been called.
171         bool                    in_va_space_teardown;
172     } pinned_pages;
173 
174     struct
175     {
176         // Whether thrashing mitigation is enabled on this VA space
177         bool                                  enable;
178 
179         // true if the thrashing mitigation parameters have been modified using
180         // test ioctls
181         bool                          test_overrides;
182 
183         //
184         // Fields below are the thrashing mitigation parameters on the VA space
185         //
186         unsigned                           threshold;
187 
188         unsigned                       pin_threshold;
189 
190         NvU64                               lapse_ns;
191 
192         NvU64                                 nap_ns;
193 
194         NvU64                               epoch_ns;
195 
196         unsigned                          max_resets;
197 
198         NvU64                                 pin_ns;
199     } params;
200 
201     uvm_va_space_t                         *va_space;
202 } va_space_thrashing_info_t;
203 
204 typedef struct
205 {
206     // Entry for the per-processor thrashing_stats file in procfs
207     struct proc_dir_entry *procfs_file;
208 
209     // Number of times thrashing is detected
210     atomic64_t num_thrashing;
211 
212     // Number of times the processor was throttled while thrashing
213     atomic64_t num_throttle;
214 
215     // Number of times a page was pinned on this processor while thrashing
216     atomic64_t num_pin_local;
217 
218     // Number of times a page was pinned on a different processor while thrashing
219     atomic64_t num_pin_remote;
220 } processor_thrashing_stats_t;
221 
222 // Pre-allocated thrashing stats structure for the CPU. This is only valid if
223 // uvm_procfs_is_debug_enabled() returns true.
224 static processor_thrashing_stats_t g_cpu_thrashing_stats;
225 
226 #define PROCESSOR_THRASHING_STATS_INC(va_space, proc, field)                                         \
227     do {                                                                                             \
228         processor_thrashing_stats_t *_processor_stats = thrashing_stats_get_or_null(va_space, proc); \
229         if (_processor_stats)                                                                        \
230             atomic64_inc(&_processor_stats->field);                                                  \
231     } while (0)
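
// Illustrative use of the macro above (hypothetical call site; real call
// sites appear further down in this file), assuming a valid va_space and
// processor id are in scope:
//
//     PROCESSOR_THRASHING_STATS_INC(va_space, processor_id, num_throttle);
//
// The increment is silently skipped when no stats struct exists for the
// processor (e.g. when the procfs debug files are disabled).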
232 
233 // Global caches for the per-VA block thrashing detection structures
234 static struct kmem_cache *g_va_block_thrashing_info_cache __read_mostly;
235 static struct kmem_cache *g_pinned_page_cache __read_mostly;
236 
237 //
238 // Tunables for thrashing detection/prevention (configurable via module parameters)
239 //
240 
241 #define UVM_PERF_THRASHING_ENABLE_DEFAULT 1
242 
243 // Enable/disable thrashing performance heuristics
244 static unsigned uvm_perf_thrashing_enable = UVM_PERF_THRASHING_ENABLE_DEFAULT;
245 
246 #define UVM_PERF_THRASHING_THRESHOLD_DEFAULT 3
247 #define UVM_PERF_THRASHING_THRESHOLD_MAX     ((1 << PAGE_THRASHING_NUM_EVENTS_BITS) - 1)
248 
249 // Number of consecutive thrashing events to initiate thrashing prevention
250 //
251 // Maximum value is UVM_PERF_THRASHING_THRESHOLD_MAX
252 static unsigned uvm_perf_thrashing_threshold = UVM_PERF_THRASHING_THRESHOLD_DEFAULT;
253 
254 #define UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT 10
255 #define UVM_PERF_THRASHING_PIN_THRESHOLD_MAX     ((1 << PAGE_THRASHING_THROTTLING_COUNT_BITS) - 1)
256 
257 // Number of consecutive throttling operations before trying to map remotely
258 //
259 // Maximum value is UVM_PERF_THRASHING_PIN_THRESHOLD_MAX
260 static unsigned uvm_perf_thrashing_pin_threshold = UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT;
261 
262 // TODO: Bug 1768615: [uvm] Automatically tune default values for thrashing
263 // detection/prevention parameters
264 #define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT 500
265 #define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_EMULATION (UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 800)
266 #define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_HCC (UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 10)
267 
268 // Lapse of time in microseconds that determines if two consecutive events on
269 // the same page can be considered thrashing
270 static unsigned uvm_perf_thrashing_lapse_usec = UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT;
271 
272 #define UVM_PERF_THRASHING_NAP_DEFAULT 1
273 #define UVM_PERF_THRASHING_NAP_MAX     100
274 
// Time during which a throttled processor is forbidden from working on the
// thrashing page. This value is a multiplier of uvm_perf_thrashing_lapse_usec.
277 static unsigned uvm_perf_thrashing_nap = UVM_PERF_THRASHING_NAP_DEFAULT;
278 
279 #define UVM_PERF_THRASHING_EPOCH_DEFAULT 2000
280 
281 // Time lapse after which we consider thrashing is no longer happening. This
282 // value is a multiplier of uvm_perf_thrashing_lapse_usec.
283 static unsigned uvm_perf_thrashing_epoch = UVM_PERF_THRASHING_EPOCH_DEFAULT;
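
// For reference, with the defaults above (500us lapse, nap multiplier 1,
// epoch multiplier 2000), a throttled processor naps for ~500us and thrashing
// state expires after ~1 second without further events on the page.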
284 
// When a page is pinned and the rest of the thrashing processors are mapped
// remotely, we lose track of who is accessing the page for the rest of
// program execution. This can lead to a tremendous performance loss if the
// page stops thrashing but keeps being accessed remotely.
// To avoid that scenario, we use a timer that unpins memory after some time.
// Pinned pages are kept in a per-VA space list, sorted by the deadline at
// which each page will be unmapped from remote processors. The next remote
// access after that will trigger a fault that migrates the page.
293 #define UVM_PERF_THRASHING_PIN_DEFAULT 300
294 #define UVM_PERF_THRASHING_PIN_DEFAULT_EMULATION 10
295 
296 // Time for which a page remains pinned. This value is a multiplier of
297 // uvm_perf_thrashing_lapse_usec. 0 means that it is pinned forever.
298 static unsigned uvm_perf_thrashing_pin = UVM_PERF_THRASHING_PIN_DEFAULT;
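
// For reference, with the defaults above (500us lapse, pin multiplier 300), a
// page remains pinned for ~150ms before the delayed worker unpins it. The
// emulation defaults yield a much longer pin time on simulated devices.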
299 
300 // Number of times a VA block can be reset back to non-thrashing. This
301 // mechanism tries to avoid performing optimizations on a block that periodically
302 // causes thrashing
303 #define UVM_PERF_THRASHING_MAX_RESETS_DEFAULT 4
304 
305 static unsigned uvm_perf_thrashing_max_resets = UVM_PERF_THRASHING_MAX_RESETS_DEFAULT;
306 
307 // Module parameters for the tunables
308 module_param(uvm_perf_thrashing_enable,        uint, S_IRUGO);
309 module_param(uvm_perf_thrashing_threshold,     uint, S_IRUGO);
310 module_param(uvm_perf_thrashing_pin_threshold, uint, S_IRUGO);
311 module_param(uvm_perf_thrashing_lapse_usec,    uint, S_IRUGO);
312 module_param(uvm_perf_thrashing_nap,           uint, S_IRUGO);
313 module_param(uvm_perf_thrashing_epoch,         uint, S_IRUGO);
314 module_param(uvm_perf_thrashing_pin,           uint, S_IRUGO);
315 module_param(uvm_perf_thrashing_max_resets,    uint, S_IRUGO);
316 
// See map_remote_on_atomic_fault in uvm_va_block.c
318 unsigned uvm_perf_map_remote_on_native_atomics_fault = 0;
319 module_param(uvm_perf_map_remote_on_native_atomics_fault, uint, S_IRUGO);
320 
// Global post-processed values of the module parameters. They can be
// overridden per VA space.
323 static bool g_uvm_perf_thrashing_enable;
324 static unsigned g_uvm_perf_thrashing_threshold;
325 static unsigned g_uvm_perf_thrashing_pin_threshold;
326 static NvU64 g_uvm_perf_thrashing_lapse_usec;
327 static NvU64 g_uvm_perf_thrashing_nap;
328 static NvU64 g_uvm_perf_thrashing_epoch;
329 static NvU64 g_uvm_perf_thrashing_pin;
330 static unsigned g_uvm_perf_thrashing_max_resets;
331 
332 // Helper macros to initialize thrashing parameters from module parameters
333 //
334 // This helper returns whether the type for the parameter is signed
335 #define THRASHING_PARAMETER_IS_SIGNED(v) (((typeof(v)) -1) < 0)
336 
// Macro that initializes the given thrashing parameter from the module
// parameter _v if its value lies within [_mi, _ma]. Otherwise, it falls back
// to the given default _d. The final value is stored in a variable named
// g_##_v, which must be declared, too. Only unsigned parameters are
// supported.
342 #define INIT_THRASHING_PARAMETER_MIN_MAX(_v, _d, _mi, _ma)                      \
343     do {                                                                        \
344         unsigned v = (_v);                                                      \
345         unsigned d = (_d);                                                      \
346         unsigned mi = (_mi);                                                    \
347         unsigned ma = (_ma);                                                    \
348                                                                                 \
349         BUILD_BUG_ON(sizeof(_v) > sizeof(unsigned));                            \
350         BUILD_BUG_ON(THRASHING_PARAMETER_IS_SIGNED(_v));                        \
351                                                                                 \
352         UVM_ASSERT(mi <= ma);                                                   \
353         UVM_ASSERT(d >= mi);                                                    \
354         UVM_ASSERT(d <= ma);                                                    \
355                                                                                 \
356         if (v >= mi && v <= ma) {                                               \
357             g_##_v = v;                                                         \
358         }                                                                       \
359         else {                                                                  \
360             pr_info("Invalid value %u for " #_v ". Using %u instead\n", v, d);  \
361                                                                                 \
362             g_##_v = d;                                                         \
363         }                                                                       \
364     } while (0)
365 
366 #define INIT_THRASHING_PARAMETER(v, d)                 INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 0u, UINT_MAX)
367 
368 #define INIT_THRASHING_PARAMETER_MIN(v, d, mi)         INIT_THRASHING_PARAMETER_MIN_MAX(v, d, mi, UINT_MAX)
369 #define INIT_THRASHING_PARAMETER_MAX(v, d, ma)         INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 0u, ma)
370 
371 #define INIT_THRASHING_PARAMETER_NONZERO(v, d)         INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 1u, UINT_MAX)
372 #define INIT_THRASHING_PARAMETER_NONZERO_MAX(v, d, ma) INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 1u, ma)
373 
374 #define INIT_THRASHING_PARAMETER_TOGGLE(v, d)          INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 0u, 1u)
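
// Illustrative (hypothetical) use of the macros above, showing how a raw
// module parameter would be validated and copied into its g_-prefixed global:
//
//     INIT_THRASHING_PARAMETER_TOGGLE(uvm_perf_thrashing_enable,
//                                     UVM_PERF_THRASHING_ENABLE_DEFAULT);
//     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_threshold,
//                                          UVM_PERF_THRASHING_THRESHOLD_DEFAULT,
//                                          UVM_PERF_THRASHING_THRESHOLD_MAX);
//
// The first call would populate g_uvm_perf_thrashing_enable, the second
// g_uvm_perf_thrashing_threshold.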
375 
376 // Helpers to get/set the time stamp
377 static NvU64 page_thrashing_get_time_stamp(page_thrashing_info_t *entry)
378 {
379     return entry->last_time_stamp << (64 - PAGE_THRASHING_LAST_TIME_STAMP_BITS);
380 }
381 
382 static void page_thrashing_set_time_stamp(page_thrashing_info_t *entry, NvU64 time_stamp)
383 {
384     entry->last_time_stamp = time_stamp >> (64 - PAGE_THRASHING_LAST_TIME_STAMP_BITS);
385 }
386 
387 static NvU64 page_thrashing_get_throttling_end_time_stamp(page_thrashing_info_t *entry)
388 {
389     return entry->throttling_end_time_stamp << (64 - PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS);
390 }
391 
392 static void page_thrashing_set_throttling_end_time_stamp(page_thrashing_info_t *entry, NvU64 time_stamp)
393 {
394     entry->throttling_end_time_stamp = time_stamp >> (64 - PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS);
395 }
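
// Illustrative round trip through the helpers above (a sketch, not driver
// code): storing a time stamp drops its lowest 6 bits, so reading it back
// returns the value rounded down to a multiple of 64ns.
//
//     page_thrashing_info_t entry = {0};
//     page_thrashing_set_time_stamp(&entry, 1000);    // stores 1000 >> 6
//     // page_thrashing_get_time_stamp(&entry) == 960 (i.e. 1000 & ~0x3fULL)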
396 
397 // Performance heuristics module for thrashing
398 static uvm_perf_module_t g_module_thrashing;
399 
// Callback declarations for the performance heuristics events
401 static void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
402 static void thrashing_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
403 static void thrashing_block_munmap_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
404 
405 static uvm_perf_module_event_callback_desc_t g_callbacks_thrashing[] = {
406     { UVM_PERF_EVENT_BLOCK_DESTROY, thrashing_block_destroy_cb },
407     { UVM_PERF_EVENT_MODULE_UNLOAD, thrashing_block_destroy_cb },
408     { UVM_PERF_EVENT_BLOCK_SHRINK , thrashing_block_destroy_cb },
409     { UVM_PERF_EVENT_BLOCK_MUNMAP , thrashing_block_munmap_cb  },
410     { UVM_PERF_EVENT_MIGRATION,     thrashing_event_cb         },
411     { UVM_PERF_EVENT_REVOCATION,    thrashing_event_cb         }
412 };
413 
414 static int nv_procfs_read_thrashing_stats(struct seq_file *s, void *v)
415 {
416     processor_thrashing_stats_t *processor_stats = (processor_thrashing_stats_t *)s->private;
417 
418     UVM_ASSERT(processor_stats);
419 
    if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
        return -EAGAIN;
422 
423     UVM_SEQ_OR_DBG_PRINT(s, "thrashing     %llu\n", (NvU64)atomic64_read(&processor_stats->num_thrashing));
424     UVM_SEQ_OR_DBG_PRINT(s, "throttle      %llu\n", (NvU64)atomic64_read(&processor_stats->num_throttle));
425     UVM_SEQ_OR_DBG_PRINT(s, "pin_local     %llu\n", (NvU64)atomic64_read(&processor_stats->num_pin_local));
426     UVM_SEQ_OR_DBG_PRINT(s, "pin_remote    %llu\n", (NvU64)atomic64_read(&processor_stats->num_pin_remote));
427 
428     uvm_up_read(&g_uvm_global.pm.lock);
429 
430     return 0;
431 }
432 
433 static int nv_procfs_read_thrashing_stats_entry(struct seq_file *s, void *v)
434 {
435     UVM_ENTRY_RET(nv_procfs_read_thrashing_stats(s, v));
436 }
437 
438 UVM_DEFINE_SINGLE_PROCFS_FILE(thrashing_stats_entry);
439 
440 #define THRASHING_STATS_FILE_NAME "thrashing_stats"
441 
442 // Initialization/deinitialization of CPU thrashing stats
443 //
444 static NV_STATUS cpu_thrashing_stats_init(void)
445 {
446     struct proc_dir_entry *cpu_base_dir_entry = uvm_procfs_get_cpu_base_dir();
447 
448     if (uvm_procfs_is_debug_enabled()) {
449         UVM_ASSERT(!g_cpu_thrashing_stats.procfs_file);
450         g_cpu_thrashing_stats.procfs_file = NV_CREATE_PROC_FILE(THRASHING_STATS_FILE_NAME,
451                                                                 cpu_base_dir_entry,
452                                                                 thrashing_stats_entry,
453                                                                 &g_cpu_thrashing_stats);
454         if (!g_cpu_thrashing_stats.procfs_file)
455             return NV_ERR_OPERATING_SYSTEM;
456     }
457 
458     return NV_OK;
459 }
460 
461 static void cpu_thrashing_stats_exit(void)
462 {
463     if (g_cpu_thrashing_stats.procfs_file) {
464         UVM_ASSERT(uvm_procfs_is_debug_enabled());
465         proc_remove(g_cpu_thrashing_stats.procfs_file);
466         g_cpu_thrashing_stats.procfs_file = NULL;
467     }
468 }
469 
// Get the thrashing stats struct for the given GPU if it exists
471 //
472 // No lock may be held. Therefore, the stats must be updated using atomics
473 static processor_thrashing_stats_t *gpu_thrashing_stats_get_or_null(uvm_gpu_t *gpu)
474 {
475     return uvm_perf_module_type_data(gpu->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
476 }
477 
478 static processor_thrashing_stats_t *thrashing_stats_get_or_null(uvm_va_space_t *va_space, uvm_processor_id_t id)
479 {
480     if (UVM_ID_IS_CPU(id)) {
481         if (g_cpu_thrashing_stats.procfs_file)
482             return &g_cpu_thrashing_stats;
483 
484         return NULL;
485     }
486 
487     return gpu_thrashing_stats_get_or_null(uvm_va_space_get_gpu(va_space, id));
488 }
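
// Note: CPU stats are accounted in the single global g_cpu_thrashing_stats
// structure, while each GPU gets its own structure hanging off its
// perf_modules_data (see gpu_thrashing_stats_create below).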
489 
490 // Create the thrashing stats struct for the given GPU
491 //
492 // Global lock needs to be held
493 static NV_STATUS gpu_thrashing_stats_create(uvm_gpu_t *gpu)
494 {
495     processor_thrashing_stats_t *gpu_thrashing;
496 
497     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
498     UVM_ASSERT(gpu_thrashing_stats_get_or_null(gpu) == NULL);
499     UVM_ASSERT(uvm_procfs_is_debug_enabled());
500 
501     gpu_thrashing = uvm_kvmalloc_zero(sizeof(*gpu_thrashing));
502     if (!gpu_thrashing)
503         return NV_ERR_NO_MEMORY;
504 
505     gpu_thrashing->procfs_file = NV_CREATE_PROC_FILE(THRASHING_STATS_FILE_NAME,
506                                                      gpu->procfs.dir,
507                                                      thrashing_stats_entry,
508                                                      gpu_thrashing);
509     if (!gpu_thrashing->procfs_file) {
510         uvm_kvfree(gpu_thrashing);
511         return NV_ERR_OPERATING_SYSTEM;
512     }
513 
514     uvm_perf_module_type_set_data(gpu->perf_modules_data, gpu_thrashing, UVM_PERF_MODULE_TYPE_THRASHING);
515 
516     return NV_OK;
517 }
518 
519 static void gpu_thrashing_stats_destroy(uvm_gpu_t *gpu)
520 {
521     processor_thrashing_stats_t *gpu_thrashing = gpu_thrashing_stats_get_or_null(gpu);
522 
523     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
524 
525     if (gpu_thrashing) {
526         uvm_perf_module_type_unset_data(gpu->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
527 
528         if (gpu_thrashing->procfs_file)
529             proc_remove(gpu_thrashing->procfs_file);
530 
531         uvm_kvfree(gpu_thrashing);
532     }
533 }
534 
535 // Get the thrashing detection struct for the given VA space if it exists
536 //
537 // The caller must ensure that the va_space cannot be deleted, for the
538 // duration of this call. Holding either the va_block or va_space lock will do
539 // that.
540 static va_space_thrashing_info_t *va_space_thrashing_info_get_or_null(uvm_va_space_t *va_space)
541 {
542     return uvm_perf_module_type_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
543 }
544 
545 // Get the thrashing detection struct for the given VA space. It asserts that
546 // the information has been previously created.
547 //
548 // The caller must ensure that the va_space cannot be deleted, for the
549 // duration of this call. Holding either the va_block or va_space lock will do
550 // that.
551 static va_space_thrashing_info_t *va_space_thrashing_info_get(uvm_va_space_t *va_space)
552 {
553     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
554     UVM_ASSERT(va_space_thrashing);
555 
556     return va_space_thrashing;
557 }
558 
559 static void va_space_thrashing_info_init_params(va_space_thrashing_info_t *va_space_thrashing)
560 {
561     UVM_ASSERT(!va_space_thrashing->params.test_overrides);
562 
563     va_space_thrashing->params.enable = g_uvm_perf_thrashing_enable;
564 
565     // Snap the thrashing parameters so that they can be tuned per VA space
566     va_space_thrashing->params.threshold     = g_uvm_perf_thrashing_threshold;
567     va_space_thrashing->params.pin_threshold = g_uvm_perf_thrashing_pin_threshold;
568 
    // Default thrashing parameters are overridden for simulated/emulated GPUs
570     if (g_uvm_global.num_simulated_devices > 0 &&
571         (g_uvm_perf_thrashing_lapse_usec == UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT)) {
572         va_space_thrashing->params.lapse_ns  = UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_EMULATION * 1000;
573     }
574     else {
575         va_space_thrashing->params.lapse_ns  = g_uvm_perf_thrashing_lapse_usec * 1000;
576     }
577 
578     va_space_thrashing->params.nap_ns        = va_space_thrashing->params.lapse_ns * g_uvm_perf_thrashing_nap;
579     va_space_thrashing->params.epoch_ns      = va_space_thrashing->params.lapse_ns * g_uvm_perf_thrashing_epoch;
580 
581     if (g_uvm_global.num_simulated_devices > 0 && (g_uvm_perf_thrashing_pin == UVM_PERF_THRASHING_PIN_DEFAULT)) {
582         va_space_thrashing->params.pin_ns    = va_space_thrashing->params.lapse_ns
583                                                * UVM_PERF_THRASHING_PIN_DEFAULT_EMULATION;
584     }
585     else {
586         va_space_thrashing->params.pin_ns    = va_space_thrashing->params.lapse_ns * g_uvm_perf_thrashing_pin;
587     }
588 
589     va_space_thrashing->params.max_resets    = g_uvm_perf_thrashing_max_resets;
590 }
591 
592 // Create the thrashing detection struct for the given VA space
593 //
594 // VA space lock needs to be held in write mode
595 static va_space_thrashing_info_t *va_space_thrashing_info_create(uvm_va_space_t *va_space)
596 {
597     va_space_thrashing_info_t *va_space_thrashing;
598     uvm_assert_rwsem_locked_write(&va_space->lock);
599 
600     UVM_ASSERT(va_space_thrashing_info_get_or_null(va_space) == NULL);
601 
602     va_space_thrashing = uvm_kvmalloc_zero(sizeof(*va_space_thrashing));
603     if (va_space_thrashing) {
604         uvm_va_block_context_t *block_context = uvm_va_block_context_alloc(NULL);
605 
606         if (!block_context) {
607             uvm_kvfree(va_space_thrashing);
608             return NULL;
609         }
610 
611         va_space_thrashing->pinned_pages.va_block_context = block_context;
612         va_space_thrashing->va_space = va_space;
613 
614         va_space_thrashing_info_init_params(va_space_thrashing);
615 
616         uvm_perf_module_type_set_data(va_space->perf_modules_data, va_space_thrashing, UVM_PERF_MODULE_TYPE_THRASHING);
617     }
618 
619     return va_space_thrashing;
620 }
621 
622 // Destroy the thrashing detection struct for the given VA space
623 //
624 // VA space lock needs to be in write mode
625 static void va_space_thrashing_info_destroy(uvm_va_space_t *va_space)
626 {
627     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
628     uvm_assert_rwsem_locked_write(&va_space->lock);
629 
630     if (va_space_thrashing) {
631         uvm_perf_module_type_unset_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
632         uvm_va_block_context_free(va_space_thrashing->pinned_pages.va_block_context);
633         uvm_kvfree(va_space_thrashing);
634     }
635 }
636 
637 // Get the thrashing detection struct for the given block
638 static block_thrashing_info_t *thrashing_info_get(uvm_va_block_t *va_block)
639 {
640     uvm_assert_mutex_locked(&va_block->lock);
641     return uvm_perf_module_type_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
642 }
643 
644 // Get the thrashing detection struct for the given block or create it if it
645 // does not exist
646 static block_thrashing_info_t *thrashing_info_get_create(uvm_va_block_t *va_block)
647 {
648     block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
649 
650     BUILD_BUG_ON((1 << 8 * sizeof(block_thrashing->num_thrashing_pages)) < PAGES_PER_UVM_VA_BLOCK);
651     BUILD_BUG_ON((1 << 16) < UVM_ID_MAX_PROCESSORS);
652 
653     if (!block_thrashing) {
654         block_thrashing = nv_kmem_cache_zalloc(g_va_block_thrashing_info_cache, NV_UVM_GFP_FLAGS);
655         if (!block_thrashing)
656             goto done;
657 
658         block_thrashing->last_processor = UVM_ID_INVALID;
659         INIT_LIST_HEAD(&block_thrashing->pinned_pages.list);
660 
661         uvm_perf_module_type_set_data(va_block->perf_modules_data, block_thrashing, UVM_PERF_MODULE_TYPE_THRASHING);
662     }
663 
664 done:
665     return block_thrashing;
666 }
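
// Note: the per-page state array (block_thrashing->pages) is not allocated
// here. Code paths in this file guard against it being NULL, and it is
// presumably allocated lazily once per-page tracking is actually needed.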
667 
668 static void thrashing_reset_pages_in_region(uvm_va_block_t *va_block, NvU64 address, NvU64 bytes);
669 
670 void uvm_perf_thrashing_info_destroy(uvm_va_block_t *va_block)
671 {
672     block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
673 
674     if (block_thrashing) {
675         thrashing_reset_pages_in_region(va_block, va_block->start, uvm_va_block_size(va_block));
676 
677         uvm_perf_module_type_unset_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
678 
679         uvm_kvfree(block_thrashing->pages);
680         kmem_cache_free(g_va_block_thrashing_info_cache, block_thrashing);
681     }
682 }
683 
684 void thrashing_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
685 {
686     uvm_va_block_t *va_block;
687 
688     UVM_ASSERT(g_uvm_perf_thrashing_enable);
689 
690     UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_DESTROY ||
691                event_id == UVM_PERF_EVENT_BLOCK_SHRINK ||
692                event_id == UVM_PERF_EVENT_MODULE_UNLOAD);
693 
694     if (event_id == UVM_PERF_EVENT_BLOCK_DESTROY)
695         va_block = event_data->block_destroy.block;
696     else if (event_id == UVM_PERF_EVENT_BLOCK_SHRINK)
697         va_block = event_data->block_shrink.block;
698     else
699         va_block = event_data->module_unload.block;
700 
701     if (!va_block)
702         return;
703 
704     uvm_perf_thrashing_info_destroy(va_block);
705 }
706 
707 void thrashing_block_munmap_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
708 {
709     uvm_va_block_t *va_block = event_data->block_munmap.block;
710     uvm_va_block_region_t region = event_data->block_munmap.region;
711 
712     UVM_ASSERT(g_uvm_perf_thrashing_enable);
713     UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_MUNMAP);
714     UVM_ASSERT(va_block);
715 
716     thrashing_reset_pages_in_region(va_block,
717                                     uvm_va_block_region_start(va_block, region),
718                                     uvm_va_block_region_size(region));
719 }
720 
721 // Sanity checks of the thrashing tracking state
722 static bool thrashing_state_checks(uvm_va_block_t *va_block,
723                                    block_thrashing_info_t *block_thrashing,
724                                    page_thrashing_info_t *page_thrashing,
725                                    uvm_page_index_t page_index)
726 {
727     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
728     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
729 
730     if (!block_thrashing) {
731         UVM_ASSERT(!page_thrashing);
732         return true;
733     }
734 
735     UVM_ASSERT(uvm_page_mask_subset(&block_thrashing->pinned_pages.mask, &block_thrashing->thrashing_pages));
736 
737     if (page_thrashing) {
738         UVM_ASSERT(block_thrashing->pages);
739         UVM_ASSERT(page_thrashing == &block_thrashing->pages[page_index]);
740     }
741     else {
742         UVM_ASSERT(!uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
743         return true;
744     }
745 
746     UVM_ASSERT(uvm_processor_mask_subset(&page_thrashing->throttled_processors,
747                                          &page_thrashing->processors));
748 
749     if (uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index))
750         UVM_ASSERT(page_thrashing->num_thrashing_events >= va_space_thrashing->params.threshold);
751 
752     if (page_thrashing->pinned) {
753         UVM_ASSERT(uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
754         UVM_ASSERT(UVM_ID_IS_VALID(page_thrashing->pinned_residency_id));
755         UVM_ASSERT(page_thrashing->throttling_count == 0);
756     }
757     else {
758         UVM_ASSERT(!uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
759         UVM_ASSERT(UVM_ID_IS_INVALID(page_thrashing->pinned_residency_id));
760 
761         if (!uvm_processor_mask_empty(&page_thrashing->throttled_processors)) {
762             UVM_ASSERT(page_thrashing->throttling_count > 0);
763             UVM_ASSERT(uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
764         }
765     }
766 
767     return true;
768 }
769 
770 // Update throttling heuristics. Mainly check if a new throttling period has
771 // started and choose the next processor not to be throttled. This function
772 // is executed before the thrashing mitigation logic kicks in.
773 static void thrashing_throttle_update(va_space_thrashing_info_t *va_space_thrashing,
774                                       uvm_va_block_t *va_block,
775                                       page_thrashing_info_t *page_thrashing,
776                                       uvm_processor_id_t processor,
777                                       NvU64 time_stamp)
778 {
779     NvU64 current_end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
780 
781     uvm_assert_mutex_locked(&va_block->lock);
782 
783     if (time_stamp > current_end_time_stamp) {
784         NvU64 throttling_end_time_stamp = time_stamp + va_space_thrashing->params.nap_ns;
785         page_thrashing_set_throttling_end_time_stamp(page_thrashing, throttling_end_time_stamp);
786 
787         // Avoid choosing the same processor in consecutive thrashing periods
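        // (e.g. if the processor that was exempt in the previous period is
        // also the first one to fault in the new period, the exemption is
        // cleared so that a different processor gets picked next time).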
788         if (uvm_id_equal(page_thrashing->do_not_throttle_processor_id, processor))
789             page_thrashing->do_not_throttle_processor_id = UVM_ID_INVALID;
790         else
791             page_thrashing->do_not_throttle_processor_id = processor;
792     }
793     else if (UVM_ID_IS_INVALID(page_thrashing->do_not_throttle_processor_id)) {
794         page_thrashing->do_not_throttle_processor_id = processor;
795     }
796 }
797 
798 // Throttle the execution of a processor. If this is the first processor being
799 // throttled for a throttling period, compute the time stamp until which the
800 // rest of processors will be throttled on fault.
801 //
802 // - Page may be pinned (possible in thrashing due to revocation, such as
803 //   in system-wide atomics)
804 // - Requesting processor must not be throttled at this point.
805 //
806 static void thrashing_throttle_processor(uvm_va_block_t *va_block,
807                                          block_thrashing_info_t *block_thrashing,
808                                          page_thrashing_info_t *page_thrashing,
809                                          uvm_page_index_t page_index,
810                                          uvm_processor_id_t processor)
811 {
812     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
813     NvU64 address = uvm_va_block_cpu_page_address(va_block, page_index);
814 
815     uvm_assert_mutex_locked(&va_block->lock);
816 
817     UVM_ASSERT(!uvm_id_equal(processor, page_thrashing->do_not_throttle_processor_id));
818 
819     if (!uvm_processor_mask_test_and_set(&page_thrashing->throttled_processors, processor)) {
820         // CPU is throttled by sleeping. This is done in uvm_vm_fault so it
821         // drops the VA block and VA space locks. Throttling start/end events
822         // are recorded around the sleep calls.
823         if (UVM_ID_IS_GPU(processor))
824             uvm_tools_record_throttling_start(va_space, address, processor);
825 
826         if (!page_thrashing->pinned)
827             UVM_PERF_SATURATING_INC(page_thrashing->throttling_count);
828 
829         UVM_PERF_SATURATING_INC(block_thrashing->throttling_count);
830     }
831 
832     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
833 }
834 
835 // Stop throttling on the given processor. If this is the last processor being
836 // throttled for a throttling period, it will clear the throttling period.
837 //
838 // - Page may be pinned (possible in thrashing due to revocation, such as
839 //   in system-wide atomics)
840 // - Requesting processor must be throttled at this point.
841 //
842 static void thrashing_throttle_end_processor(uvm_va_block_t *va_block,
843                                              block_thrashing_info_t *block_thrashing,
844                                              page_thrashing_info_t *page_thrashing,
845                                              uvm_page_index_t page_index,
846                                              uvm_processor_id_t processor)
847 {
848     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
849     NvU64 address = uvm_va_block_cpu_page_address(va_block, page_index);
850 
851     UVM_ASSERT(uvm_processor_mask_test(&page_thrashing->throttled_processors, processor));
852     uvm_processor_mask_clear(&page_thrashing->throttled_processors, processor);
853     if (uvm_processor_mask_empty(&page_thrashing->throttled_processors))
854         page_thrashing_set_throttling_end_time_stamp(page_thrashing, 0);
855 
856     // See comment regarding throttling start/end events for CPU in
857     // thrashing_throttle_processor
858     if (UVM_ID_IS_GPU(processor))
859         uvm_tools_record_throttling_end(va_space, address, processor);
860 
861     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
862 }
863 
864 // Clear the throttling state for all processors. This is used while
865 // transitioning to pinned state and during thrashing information reset.
866 static void thrashing_throttling_reset_page(uvm_va_block_t *va_block,
867                                             block_thrashing_info_t *block_thrashing,
868                                             page_thrashing_info_t *page_thrashing,
869                                             uvm_page_index_t page_index)
870 {
871     uvm_processor_id_t processor_id;
872 
873     for_each_id_in_mask(processor_id, &page_thrashing->throttled_processors) {
874         thrashing_throttle_end_processor(va_block,
875                                          block_thrashing,
876                                          page_thrashing,
877                                          page_index,
878                                          processor_id);
879     }
880 
881     UVM_ASSERT(uvm_processor_mask_empty(&page_thrashing->throttled_processors));
882 }
883 
884 // Find the pinned page descriptor for the given page index. Return NULL if the
885 // page is not pinned.
886 static pinned_page_t *find_pinned_page(block_thrashing_info_t *block_thrashing, uvm_page_index_t page_index)
887 {
888     pinned_page_t *pinned_page;
889 
890     list_for_each_entry(pinned_page, &block_thrashing->pinned_pages.list, va_block_list_entry) {
891         if (pinned_page->page_index == page_index)
892             return pinned_page;
893     }
894 
895     return NULL;
896 }
897 
898 // Pin a page on the specified processor. All thrashing processors will be
899 // mapped remotely on this location, when possible
900 //
901 // - Requesting processor cannot be throttled
902 //
903 static NV_STATUS thrashing_pin_page(va_space_thrashing_info_t *va_space_thrashing,
904                                     uvm_va_block_t *va_block,
905                                     block_thrashing_info_t *block_thrashing,
906                                     page_thrashing_info_t *page_thrashing,
907                                     uvm_page_index_t page_index,
908                                     NvU64 time_stamp,
909                                     uvm_processor_id_t residency,
910                                     uvm_processor_id_t requester)
911 {
912     uvm_processor_mask_t current_residency;
913 
914     uvm_assert_mutex_locked(&va_block->lock);
915     UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester));
916 
917     uvm_va_block_page_resident_processors(va_block, page_index, &current_residency);
918 
    // If we are pinning the page for the first time or we are pinning it on a
    // different location than the current one, reset the throttling state to
    // make sure that we flush any pending ThrottlingEnd events.
922     if (!page_thrashing->pinned || !uvm_processor_mask_test(&current_residency, residency))
923         thrashing_throttling_reset_page(va_block, block_thrashing, page_thrashing, page_index);
924 
925     if (!page_thrashing->pinned) {
926         if (va_space_thrashing->params.pin_ns > 0) {
927             pinned_page_t *pinned_page = nv_kmem_cache_zalloc(g_pinned_page_cache, NV_UVM_GFP_FLAGS);
928             if (!pinned_page)
929                 return NV_ERR_NO_MEMORY;
930 
931             pinned_page->va_block = va_block;
932             pinned_page->page_index = page_index;
933             pinned_page->deadline = time_stamp + va_space_thrashing->params.pin_ns;
934 
935             uvm_spin_lock(&va_space_thrashing->pinned_pages.lock);
936 
937             list_add_tail(&pinned_page->va_space_list_entry, &va_space_thrashing->pinned_pages.list);
938             list_add_tail(&pinned_page->va_block_list_entry, &block_thrashing->pinned_pages.list);
939 
940             // We only schedule the delayed work if the list was empty before
941             // adding this page. Otherwise, we just add it to the list. The
942             // unpinning helper will remove from the list those pages with
943             // deadline prior to its wakeup timestamp and will reschedule
944             // itself if there are remaining pages in the list.
945             if (list_is_singular(&va_space_thrashing->pinned_pages.list) &&
946                 !va_space_thrashing->pinned_pages.in_va_space_teardown) {
947                 int scheduled;
948                 scheduled = schedule_delayed_work(&va_space_thrashing->pinned_pages.dwork,
949                                                   usecs_to_jiffies(va_space_thrashing->params.pin_ns / 1000));
950                 UVM_ASSERT(scheduled != 0);
951             }
952 
953             uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
954         }
955 
956         page_thrashing->throttling_count = 0;
957         page_thrashing->pinned = true;
958         UVM_PERF_SATURATING_INC(block_thrashing->pinned_pages.count);
959         uvm_page_mask_set(&block_thrashing->pinned_pages.mask, page_index);
960     }
961 
962     page_thrashing->pinned_residency_id = residency;
963 
964     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
965 
966     return NV_OK;
967 }
968 
969 // Unpin a page. This function just clears the pinning tracking state, and does
970 // not remove remote mappings on the page. Callers will need to do it manually
971 // BEFORE calling this function, if so desired.
972 // - Page must be pinned
973 //
974 static void thrashing_unpin_page(va_space_thrashing_info_t *va_space_thrashing,
975                                  uvm_va_block_t *va_block,
976                                  block_thrashing_info_t *block_thrashing,
977                                  page_thrashing_info_t *page_thrashing,
978                                  uvm_page_index_t page_index)
979 {
980     uvm_assert_mutex_locked(&va_block->lock);
981     UVM_ASSERT(page_thrashing->pinned);
982 
983     if (va_space_thrashing->params.pin_ns > 0) {
984         bool do_free = false;
985         pinned_page_t *pinned_page = find_pinned_page(block_thrashing, page_index);
986 
987         UVM_ASSERT(pinned_page);
988         UVM_ASSERT(pinned_page->page_index == page_index);
989         UVM_ASSERT(pinned_page->va_block == va_block);
990 
991         // The va_space_list_entry and va_block_list_entry have special
992         // meanings here:
993         // - va_space_list_entry: when the delayed unpin worker removes the
994         // pinned_page from this list, it takes the ownership of the page and
995         // is in charge of freeing it.
996         // - va_block_list_entry: by removing the page from this list,
997         // thrashing_unpin_page tells the unpin delayed worker to skip
998         // unpinning that page.
999         uvm_spin_lock(&va_space_thrashing->pinned_pages.lock);
1000         list_del_init(&pinned_page->va_block_list_entry);
1001 
1002         if (!list_empty(&pinned_page->va_space_list_entry)) {
1003             do_free = true;
1004             list_del_init(&pinned_page->va_space_list_entry);
1005 
1006             if (list_empty(&va_space_thrashing->pinned_pages.list))
1007                 cancel_delayed_work(&va_space_thrashing->pinned_pages.dwork);
1008         }
1009 
1010         uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
1011 
1012         if (do_free)
1013             kmem_cache_free(g_pinned_page_cache, pinned_page);
1014     }
1015 
1016     page_thrashing->pinned_residency_id = UVM_ID_INVALID;
1017     page_thrashing->pinned = false;
1018     uvm_page_mask_clear(&block_thrashing->pinned_pages.mask, page_index);
1019 
1020     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
1021 }
1022 
1023 static void thrashing_detected(uvm_va_block_t *va_block,
1024                                block_thrashing_info_t *block_thrashing,
1025                                page_thrashing_info_t *page_thrashing,
1026                                uvm_page_index_t page_index,
1027                                uvm_processor_id_t processor_id)
1028 {
1029     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1030     NvU64 address = uvm_va_block_cpu_page_address(va_block, page_index);
1031 
1032     // Thrashing detected, record the event
1033     uvm_tools_record_thrashing(va_space, address, PAGE_SIZE, &page_thrashing->processors);
1034     if (!uvm_page_mask_test_and_set(&block_thrashing->thrashing_pages, page_index))
1035         ++block_thrashing->num_thrashing_pages;
1036 
1037     PROCESSOR_THRASHING_STATS_INC(va_space, processor_id, num_thrashing);
1038 
1039     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
1040 }
1041 
1042 // Clear the thrashing information for the given page. This function does not
1043 // unmap remote mappings on the page. Callers will need to do it BEFORE calling
1044 // this function, if so desired
1045 static void thrashing_reset_page(va_space_thrashing_info_t *va_space_thrashing,
1046                                  uvm_va_block_t *va_block,
1047                                  block_thrashing_info_t *block_thrashing,
1048                                  uvm_page_index_t page_index)
1049 {
1050     page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1051     uvm_assert_mutex_locked(&va_block->lock);
1052 
1053     UVM_ASSERT(block_thrashing->num_thrashing_pages > 0);
1054     UVM_ASSERT(uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
1055     UVM_ASSERT(page_thrashing->num_thrashing_events > 0);
1056 
1057     thrashing_throttling_reset_page(va_block, block_thrashing, page_thrashing, page_index);
1058     UVM_ASSERT(uvm_processor_mask_empty(&page_thrashing->throttled_processors));
1059 
1060     if (page_thrashing->pinned)
1061         thrashing_unpin_page(va_space_thrashing, va_block, block_thrashing, page_thrashing, page_index);
1062 
1063     page_thrashing->last_time_stamp       = 0;
1064     page_thrashing->has_migration_events  = 0;
1065     page_thrashing->has_revocation_events = 0;
1066     page_thrashing->num_thrashing_events  = 0;
1067     uvm_processor_mask_zero(&page_thrashing->processors);
1068 
1069     if (uvm_page_mask_test_and_clear(&block_thrashing->thrashing_pages, page_index))
1070         --block_thrashing->num_thrashing_pages;
1071 
1072     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
1073 }
1074 
1075 // Call thrashing_reset_page for all the thrashing pages in the region
1076 // described by address and bytes
1077 static void thrashing_reset_pages_in_region(uvm_va_block_t *va_block, NvU64 address, NvU64 bytes)
1078 {
1079     uvm_page_index_t page_index;
1080     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1081     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1082     block_thrashing_info_t *block_thrashing = NULL;
1083     uvm_va_block_region_t region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1084 
1085     block_thrashing = thrashing_info_get(va_block);
1086     if (!block_thrashing || !block_thrashing->pages)
1087         return;
1088 
1089     // Update all pages in the region
1090     for_each_va_block_page_in_region_mask(page_index, &block_thrashing->thrashing_pages, region)
1091         thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
1092 }
1093 
1094 
// Unmap remote mappings from the given processors on the pinned pages
// described by region and block_thrashing->pinned_pages.mask.
1097 static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
1098                                            uvm_va_block_context_t *va_block_context,
1099                                            block_thrashing_info_t *block_thrashing,
1100                                            uvm_va_block_region_t region,
1101                                            const uvm_processor_mask_t *unmap_processors)
1102 {
1103     NV_STATUS status = NV_OK;
1104     NV_STATUS tracker_status;
1105     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1106     uvm_processor_id_t processor_id;
1107     const uvm_va_policy_t *policy = uvm_va_policy_get(va_block, uvm_va_block_region_start(va_block, region));
1108 
1109     uvm_assert_mutex_locked(&va_block->lock);
1110 
1111     for_each_id_in_mask(processor_id, unmap_processors) {
1112         UVM_ASSERT(uvm_id_equal(processor_id, policy->preferred_location) ||
1113                    !uvm_processor_mask_test(&policy->accessed_by, processor_id));
1114 
1115         if (uvm_processor_mask_test(&va_block->resident, processor_id)) {
1116             const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
1117 
1118             if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask,
1119                                       &block_thrashing->pinned_pages.mask,
1120                                       resident_mask))
1121                 continue;
1122         }
1123         else {
1124             uvm_page_mask_copy(&va_block_context->caller_page_mask,
1125                                &block_thrashing->pinned_pages.mask);
1126         }
1127 
1128         status = uvm_va_block_unmap(va_block,
1129                                     va_block_context,
1130                                     processor_id,
1131                                     region,
1132                                     &va_block_context->caller_page_mask,
1133                                     &local_tracker);
1134         if (status != NV_OK)
1135             break;
1136     }
1137 
1138     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
1139     if (status == NV_OK)
1140         status = tracker_status;
1141 
1142     uvm_tracker_deinit(&local_tracker);
1143 
1144     return status;
1145 }
1146 
1147 NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_block,
1148                                                            uvm_va_block_context_t *va_block_context,
1149                                                            uvm_va_block_region_t region)
1150 {
1151     block_thrashing_info_t *block_thrashing;
1152     uvm_processor_mask_t unmap_processors;
1153     const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
1154 
1155     uvm_assert_mutex_locked(&va_block->lock);
1156 
1157     block_thrashing = thrashing_info_get(va_block);
1158     if (!block_thrashing || !block_thrashing->pages)
1159         return NV_OK;
1160 
1161     if (uvm_page_mask_empty(&block_thrashing->pinned_pages.mask))
1162         return NV_OK;
1163 
1164     // Unmap all mapped processors (that are not SetAccessedBy) with
1165     // no copy of the page
1166     uvm_processor_mask_andnot(&unmap_processors, &va_block->mapped, &policy->accessed_by);
1167 
1168     return unmap_remote_pinned_pages(va_block, va_block_context, block_thrashing, region, &unmap_processors);
1169 }
1170 
// Check that we are not migrating pages away from their pinned location and
// that we are not prefetching thrashing pages.
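// Note that this function always returns false and performs its checks via
// assertions, so callers presumably wrap it in an assertion themselves.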
1173 static bool migrating_wrong_pages(uvm_va_block_t *va_block,
1174                                   NvU64 address,
1175                                   NvU64 bytes,
1176                                   uvm_processor_id_t proc_id,
1177                                   uvm_make_resident_cause_t cause)
1178 {
1179     uvm_page_index_t page_index;
1180     block_thrashing_info_t *block_thrashing = NULL;
1181     uvm_va_block_region_t region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1182 
1183     block_thrashing = thrashing_info_get(va_block);
1184     if (!block_thrashing || !block_thrashing->pages)
1185         return false;
1186 
1187     for_each_va_block_page_in_region(page_index, region) {
1188         page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1189         UVM_ASSERT_MSG(!page_thrashing->pinned || uvm_id_equal(proc_id, page_thrashing->pinned_residency_id),
1190                        "Migrating to %u instead of %u\n",
1191                        uvm_id_value(proc_id), uvm_id_value(page_thrashing->pinned_residency_id));
1192         if (cause == UVM_MAKE_RESIDENT_CAUSE_PREFETCH)
1193             UVM_ASSERT(!uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
1194     }
1195 
1196     return false;
1197 }
1198 
1199 static bool is_migration_pinned_pages_update(uvm_va_block_t *va_block,
1200                                              const uvm_perf_event_data_t *event_data,
1201                                              NvU64 address,
1202                                              NvU64 bytes)
1203 {
1204     const block_thrashing_info_t *block_thrashing = NULL;
1205     uvm_va_block_region_t region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1206     bool ret;
1207 
1208     if (event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT &&
1209         event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) {
1210         return false;
1211     }
1212 
1213     block_thrashing = thrashing_info_get(va_block);
1214     if (!block_thrashing || !block_thrashing->pages)
1215         return false;
1216 
1217     ret = uvm_page_mask_region_full(&block_thrashing->pinned_pages.mask, region);
1218     if (ret) {
1219         uvm_page_index_t page_index;
1220         for_each_va_block_page_in_region(page_index, region) {
1221             page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1222             UVM_ASSERT(uvm_id_equal(page_thrashing->pinned_residency_id, event_data->migration.dst));
1223         }
1224     }
1225 
1226     return ret;
1227 }
1228 
1229 // This function processes migration/revocation events and determines whether
1230 // the affected pages are thrashing.
1231 void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
1232 {
1233     va_space_thrashing_info_t *va_space_thrashing;
1234     block_thrashing_info_t *block_thrashing = NULL;
1235     uvm_va_block_t *va_block;
1236     uvm_va_space_t *va_space;
1237     NvU64 address;
1238     NvU64 bytes;
1239     uvm_processor_id_t processor_id;
1240     uvm_page_index_t page_index;
1241     NvU64 time_stamp;
1242     uvm_va_block_region_t region;
1243     uvm_read_duplication_policy_t read_duplication;
1244 
1245     UVM_ASSERT(g_uvm_perf_thrashing_enable);
1246 
1247     UVM_ASSERT(event_id == UVM_PERF_EVENT_MIGRATION || event_id == UVM_PERF_EVENT_REVOCATION);
1248 
1249     if (event_id == UVM_PERF_EVENT_MIGRATION) {
1250         va_block     = event_data->migration.block;
1251         address      = event_data->migration.address;
1252         bytes        = event_data->migration.bytes;
1253         processor_id = event_data->migration.dst;
1254 
1255         // Skip the thrashing detection logic on eviction as we cannot take
1256         // the VA space lock
1257         if (event_data->migration.cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION)
1258             return;
1259 
1260         // Do not perform checks during the first part of staging copies
1261         if (!uvm_id_equal(event_data->migration.dst, event_data->migration.make_resident_context->dest_id))
1262             return;
1263 
1264         va_space = uvm_va_block_get_va_space(va_block);
1265         va_space_thrashing = va_space_thrashing_info_get(va_space);
1266         if (!va_space_thrashing->params.enable)
1267             return;
1268 
1269         // TODO: Bug 3660922: HMM will need to look up the policy when
1270         // read duplication is supported.
1271         read_duplication = uvm_va_block_is_hmm(va_block) ?
1272                            UVM_READ_DUPLICATION_UNSET :
1273                            uvm_va_range_get_policy(va_block->va_range)->read_duplication;
1274 
1275         // We only care about migrations due to replayable faults, access
1276         // counters and page prefetching. For non-replayable faults, UVM will
1277         // try not to migrate memory since CE is transferring data anyway.
1278         // However, we can still see migration events due to initial
1279         // population. The remaining migrations are triggered by user
1280         // commands or advice (such as read duplication), which take
1281         // precedence over our heuristics. Therefore, we clear our internal
1282         // tracking state.
1283         if ((event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT &&
1284              event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER &&
1285              event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_PREFETCH) ||
1286             (event_data->migration.transfer_mode != UVM_VA_BLOCK_TRANSFER_MODE_MOVE) ||
1287             (read_duplication == UVM_READ_DUPLICATION_ENABLED)) {
1288             thrashing_reset_pages_in_region(va_block, address, bytes);
1289             return;
1290         }
1291 
1292         // Assert that we are not migrating pages that are pinned away from
1293         // their pinning residency, or prefetching pages that are thrashing
1294         UVM_ASSERT(!migrating_wrong_pages(va_block, address, bytes, processor_id, event_data->migration.cause));
1295 
1296         // If the pages are being migrated due to pinning, just return
1297         if (is_migration_pinned_pages_update(va_block, event_data, address, bytes))
1298             return;
1299     }
1300     else {
1301         va_block     = event_data->revocation.block;
1302         address      = event_data->revocation.address;
1303         bytes        = event_data->revocation.bytes;
1304         processor_id = event_data->revocation.proc_id;
1305 
1306         va_space = uvm_va_block_get_va_space(va_block);
1307         va_space_thrashing = va_space_thrashing_info_get(va_space);
1308         if (!va_space_thrashing->params.enable)
1309             return;
1310     }
1311 
1312     block_thrashing = thrashing_info_get_create(va_block);
1313     if (!block_thrashing)
1314         return;
1315 
1316     time_stamp = NV_GETTIME();
1317 
1318     if (!block_thrashing->pages) {
1319         // Don't create the per-page tracking structure unless there is some potential thrashing within the block
1320         NvU16 num_block_pages;
1321 
1322         if (block_thrashing->last_time_stamp == 0 ||
1323             uvm_id_equal(block_thrashing->last_processor, processor_id) ||
1324             time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns)
1325             goto done;
1326 
1327         num_block_pages = uvm_va_block_size(va_block) / PAGE_SIZE;
1328 
1329         block_thrashing->pages = uvm_kvmalloc_zero(sizeof(*block_thrashing->pages) * num_block_pages);
1330         if (!block_thrashing->pages)
1331             goto done;
1332 
1333         for (page_index = 0; page_index < num_block_pages; ++page_index) {
1334             block_thrashing->pages[page_index].pinned_residency_id = UVM_ID_INVALID;
1335             block_thrashing->pages[page_index].do_not_throttle_processor_id = UVM_ID_INVALID;
1336         }
1337     }
1338 
1339     region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1340 
1341     // Update all pages in the region
1342     for_each_va_block_page_in_region(page_index, region) {
1343         page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1344         NvU64 last_time_stamp = page_thrashing_get_time_stamp(page_thrashing);
1345 
1346         // It is not possible that a pinned page is migrated here, since the
1347         // fault that triggered the migration should have unpinned it in its
1348         // call to uvm_perf_thrashing_get_hint. Moreover, page prefetching never
1349         // includes pages that are thrashing (including pinned pages)
1350         if (event_id == UVM_PERF_EVENT_MIGRATION)
1351             UVM_ASSERT(page_thrashing->pinned == 0);
1352 
1353         uvm_processor_mask_set(&page_thrashing->processors, processor_id);
1354         page_thrashing_set_time_stamp(page_thrashing, time_stamp);
1355 
1356         if (last_time_stamp == 0)
1357             continue;
1358 
1359         if (time_stamp - last_time_stamp <= va_space_thrashing->params.lapse_ns) {
1360             UVM_PERF_SATURATING_INC(page_thrashing->num_thrashing_events);
1361             if (page_thrashing->num_thrashing_events == va_space_thrashing->params.threshold)
1362                 thrashing_detected(va_block, block_thrashing, page_thrashing, page_index, processor_id);
1363 
1364             if (page_thrashing->num_thrashing_events >= va_space_thrashing->params.threshold)
1365                 block_thrashing->last_thrashing_time_stamp = time_stamp;
1366 
1367             if (event_id == UVM_PERF_EVENT_MIGRATION)
1368                 page_thrashing->has_migration_events = true;
1369             else
1370                 page_thrashing->has_revocation_events = true;
1371         }
1372         else if (page_thrashing->num_thrashing_events >= va_space_thrashing->params.threshold &&
1373                  !page_thrashing->pinned) {
1374             thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
1375         }
1376     }
1377 
1378 done:
1379     block_thrashing->last_time_stamp = time_stamp;
1380     block_thrashing->last_processor  = processor_id;
1381 }
1382 
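// Returns true if all processors that have been recorded as thrashing on the
// page can access memory on processor 'to'. Returns false if 'to' is invalid.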
1383 static bool thrashing_processors_can_access(uvm_va_space_t *va_space,
1384                                             page_thrashing_info_t *page_thrashing,
1385                                             uvm_processor_id_t to)
1386 {
1387     if (UVM_ID_IS_INVALID(to))
1388         return false;
1389 
1390     return uvm_processor_mask_subset(&page_thrashing->processors,
1391                                      &va_space->accessible_from[uvm_id_value(to)]);
1392 }
1393 
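// Returns true if all thrashing processors have fast access (NVLINK and
// native atomics) to memory on processor 'to'. 'to' itself always counts as
// having fast access. Returns false if 'to' is invalid.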
1394 static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space,
1395                                                      page_thrashing_info_t *page_thrashing,
1396                                                      uvm_processor_id_t to)
1397 {
1398     uvm_processor_mask_t fast_to;
1399 
1400     if (UVM_ID_IS_INVALID(to))
1401         return false;
1402 
1403     // Intersect the NVLINK and native atomics masks, since we could have
1404     // PCIe atomics in the future
1405     uvm_processor_mask_and(&fast_to,
1406                            &va_space->has_nvlink[uvm_id_value(to)],
1407                            &va_space->has_native_atomics[uvm_id_value(to)]);
1408     uvm_processor_mask_set(&fast_to, to);
1409 
1410     return uvm_processor_mask_subset(&page_thrashing->processors, &fast_to);
1411 }
1412 
1413 static void thrashing_processors_common_locations(uvm_va_space_t *va_space,
1414                                                   page_thrashing_info_t *page_thrashing,
1415                                                   uvm_processor_mask_t *common_locations)
1416 {
1417     bool is_first = true;
1418     uvm_processor_id_t id;
1419 
1420     // Find the processors that can be accessed by all thrashing processors.
1421     // For example: if A, B and C are thrashing, and both A and C can access B,
1422     // then B is a common location.
1423     uvm_processor_mask_zero(common_locations);
1424 
1425     for_each_id_in_mask(id, &page_thrashing->processors) {
1426         if (is_first)
1427             uvm_processor_mask_copy(common_locations, &va_space->can_access[uvm_id_value(id)]);
1428         else
1429             uvm_processor_mask_and(common_locations, common_locations, &va_space->can_access[uvm_id_value(id)]);
1430 
1431         is_first = false;
1432     }
1433 }
1434 
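// Returns true if the given preferred location is itself one of the
// processors thrashing on the page.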
1435 static bool preferred_location_is_thrashing(uvm_processor_id_t preferred_location,
1436                                             page_thrashing_info_t *page_thrashing)
1437 {
1438     if (UVM_ID_IS_INVALID(preferred_location))
1439         return false;
1440 
1441     return uvm_processor_mask_test(&page_thrashing->processors, preferred_location);
1442 }
1443 
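// Compute the hint (pin or throttle) for a page that is thrashing due to
// migrations. The choice depends on the preferred location, the closest
// resident copy, the page's current pinned residency (if any), and whether
// the requester is the processor currently exempted from throttling. If the
// chosen pinning destination has no memory, the page is pinned to the CPU
// instead.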
1444 static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thrashing_info_t *va_space_thrashing,
1445                                                                   uvm_va_block_t *va_block,
1446                                                                   uvm_page_index_t page_index,
1447                                                                   page_thrashing_info_t *page_thrashing,
1448                                                                   uvm_processor_id_t requester)
1449 {
1450     uvm_perf_thrashing_hint_t hint;
1451     uvm_processor_id_t closest_resident_id;
1452     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1453     uvm_processor_id_t do_not_throttle_processor = page_thrashing->do_not_throttle_processor_id;
1454     uvm_processor_id_t pinned_residency = page_thrashing->pinned_residency_id;
1455     const uvm_va_policy_t *policy;
1456     uvm_processor_id_t preferred_location;
1457 
1458     policy = uvm_va_policy_get(va_block, uvm_va_block_cpu_page_address(va_block, page_index));
1459 
1460     preferred_location = policy->preferred_location;
1461 
1462     hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
1463 
1464     closest_resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, requester);
1465     if (uvm_va_block_is_hmm(va_block)) {
1466         // HMM pages always start out resident on the CPU but may not be
1467         // recorded in the va_block state because hmm_range_fault() or
1468         // similar functions haven't been called to get an accurate snapshot
1469         // of the Linux state. We can assume pages are CPU resident for the
1470         // purpose of deciding where to migrate to reduce thrashing.
1471         if (UVM_ID_IS_INVALID(closest_resident_id))
1472             closest_resident_id = UVM_ID_CPU;
1473     }
1474     else {
1475         UVM_ASSERT(UVM_ID_IS_VALID(closest_resident_id));
1476     }
1477 
1478     if (thrashing_processors_can_access(va_space, page_thrashing, preferred_location)) {
1479         // The logic in uvm_va_block_select_residency chooses the preferred
1480         // location if the requester can access it, so all processors should
1481         // naturally get mapped to the preferred location without thrashing.
1482         // However, we can get here if the preferred location was set after
1483         // processors started thrashing.
1484         //
1485         // TODO: Bug 2527408. Reset thrashing history when a user policy
1486         //       changes in a VA block.
1487         hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1488         hint.pin.residency = preferred_location;
1489     }
1490     else if (!preferred_location_is_thrashing(preferred_location, page_thrashing) &&
1491              thrashing_processors_have_fast_access_to(va_space, page_thrashing, closest_resident_id)) {
1492         // This is a fast path for those scenarios in which all thrashing
1493         // processors have fast (NVLINK + native atomics) access to the current
1494         // residency. This is skipped if the preferred location is thrashing and
1495         // not accessible by the rest of thrashing processors. Otherwise, we
1496         // would be in the condition above.
1497         if (UVM_ID_IS_CPU(closest_resident_id)) {
1498             // On P9 systems, we prefer the CPU to map vidmem (since it can
1499             // cache it), so don't map the GPU to sysmem.
1500             if (UVM_ID_IS_GPU(requester)) {
1501                 hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1502                 hint.pin.residency = requester;
1503             }
1504         }
1505         else {
1506             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1507             hint.pin.residency = closest_resident_id;
1508         }
1509     }
1510     else if (uvm_id_equal(requester, preferred_location)) {
1511         if (page_thrashing->pinned) {
1512             // If the faulting processor is the preferred location, we can
1513             // only:
1514             // 1) Pin to the preferred location
1515             // 2) Throttle if it's pinned elsewhere and we are not the
1516             //    do_not_throttle_processor
1517             if (uvm_id_equal(preferred_location, pinned_residency) ||
1518                 uvm_id_equal(preferred_location, do_not_throttle_processor)) {
1519                 hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1520                 hint.pin.residency = preferred_location;
1521             }
1522             else {
1523                 hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1524             }
1525         }
1526         else if (!uvm_id_equal(preferred_location, do_not_throttle_processor)) {
1527             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1528         }
1529         else if (page_thrashing->throttling_count >= va_space_thrashing->params.pin_threshold) {
1530             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1531             hint.pin.residency = preferred_location;
1532         }
1533     }
1534     else if (page_thrashing->pinned) {
1535         // 1) If the requester is the do_not_throttle_processor, pin the page to
1536         //    the requester when all thrashing processors can access it; otherwise
1537         //    pin it to the common location closest to the requester, falling back
1538         //    to the requester if no common location exists.
1539         // 2) Try to map the current pinned residency.
1540         // 3) Throttle.
1541         if (uvm_id_equal(requester, do_not_throttle_processor)) {
1542             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1543 
1544             if (thrashing_processors_can_access(va_space, page_thrashing, requester)) {
1545                 hint.pin.residency = requester;
1546             }
1547             else {
1548                 uvm_processor_mask_t common_locations;
1549 
1550                 thrashing_processors_common_locations(va_space, page_thrashing, &common_locations);
1551                 if (uvm_processor_mask_empty(&common_locations)) {
1552                     hint.pin.residency = requester;
1553                 }
1554                 else {
1555                     // Find the common location that is closest to the requester
1556                     hint.pin.residency = uvm_processor_mask_find_closest_id(va_space, &common_locations, requester);
1557                 }
1558             }
1559         }
1560         else if (uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(pinned_residency)], requester)) {
1561             if (!uvm_va_block_is_hmm(va_block))
1562                 UVM_ASSERT(uvm_id_equal(closest_resident_id, pinned_residency));
1563 
1564             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1565             hint.pin.residency = pinned_residency;
1566         }
1567         else {
1568             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1569         }
1570     }
1571     else if (!uvm_id_equal(requester, do_not_throttle_processor)) {
1572         hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1573     }
1574     else if (page_thrashing->throttling_count >= va_space_thrashing->params.pin_threshold) {
1575         hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1576         hint.pin.residency = requester;
1577     }
1578 
1579     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN &&
1580         !uvm_va_space_processor_has_memory(va_space, hint.pin.residency))
1581         hint.pin.residency = UVM_ID_CPU;
1582 
1583     return hint;
1584 }
1585 
1586 // Function called on fault that tells the fault handler if any operation
1587 // should be performed to minimize thrashing. The logic is as follows:
1588 //
1589 // - Phase0: Block thrashing. If a number of consecutive thrashing events have
1590 //   been detected on the VA block, per-page thrashing tracking information is
1591 //   created.
1592 // - Phase1: Throttling. When several processors fight over a page, we start a
1593 //   "throttling period". During that period, only one processor will be able
1594 //   to service faults on the page, and the rest will be throttled. All CPU
1595 //   faults are considered to belong to the same device, even if they come from
1596 //   different CPU threads.
1597 // - Phase2: Pinning. After a number of consecutive throttling periods, the page
1598 //   is pinned on a specific processor which all of the thrashing processors can
1599 //   access.
1600 // - Phase3: Revocation throttling. Even if the page is pinned, it can still be
1601 //   thrashing due to revocation events (mainly due to system-wide atomics). In
1602 //   that case we keep the page pinned while applying the same algorithm as in
1603 //   Phase1.
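//
// A minimal illustrative sketch (not the actual fault servicing code; the
// helper names and the fault_addr variable below are hypothetical) of how a
// caller might consume the returned hint:
//
//     uvm_perf_thrashing_hint_t hint;
//
//     hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, requester);
//     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
//         // Defer servicing until the throttling deadline expires
//         defer_fault_until(fault_addr, hint.throttle.end_time_stamp);
//     }
//     else if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
//         // Migrate to the pinned residency and map the thrashing processors
//         service_fault_with_pinning(va_block,
//                                    fault_addr,
//                                    hint.pin.residency,
//                                    &hint.pin.processors);
//     }
//     else {
//         service_fault_normally(va_block, fault_addr, requester);
//     }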
1604 uvm_perf_thrashing_hint_t uvm_perf_thrashing_get_hint(uvm_va_block_t *va_block,
1605                                                       NvU64 address,
1606                                                       uvm_processor_id_t requester)
1607 {
1608     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1609     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1610     block_thrashing_info_t *block_thrashing = NULL;
1611     page_thrashing_info_t *page_thrashing = NULL;
1612     uvm_perf_thrashing_hint_t hint;
1613     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
1614     NvU64 time_stamp;
1615     NvU64 last_time_stamp;
1616 
1617     hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
1618 
1619     if (!va_space_thrashing->params.enable)
1620         return hint;
1621 
1622     // If we don't have enough memory to store thrashing information, we assume
1623     // no thrashing
1624     block_thrashing = thrashing_info_get(va_block);
1625     if (!block_thrashing)
1626         return hint;
1627 
1628     // If the per-page tracking structure has not been created yet, we assume
1629     // no thrashing
1630     if (!block_thrashing->pages)
1631         return hint;
1632 
1633     time_stamp = NV_GETTIME();
1634 
1635     if (block_thrashing->last_thrashing_time_stamp != 0 &&
1636         (time_stamp - block_thrashing->last_thrashing_time_stamp > va_space_thrashing->params.epoch_ns) &&
1637         block_thrashing->pinned_pages.count == 0 &&
1638         block_thrashing->thrashing_reset_count < va_space_thrashing->params.max_resets) {
1639         uvm_page_index_t reset_page_index;
1640 
1641         ++block_thrashing->thrashing_reset_count;
1642 
1643         // Clear the state of throttled processors to make sure that we flush
1644         // any pending ThrottlingEnd events
1645         for_each_va_block_page_in_mask(reset_page_index, &block_thrashing->thrashing_pages, va_block) {
1646             thrashing_throttling_reset_page(va_block,
1647                                             block_thrashing,
1648                                             &block_thrashing->pages[reset_page_index],
1649                                             reset_page_index);
1650         }
1651 
1652         // Reset per-page tracking structure
1653         // TODO: Bug 1769904 [uvm] Speculatively unpin pages that were pinned on a specific memory due to thrashing
1654         UVM_ASSERT(uvm_page_mask_empty(&block_thrashing->pinned_pages.mask));
1655         uvm_kvfree(block_thrashing->pages);
1656         block_thrashing->pages                     = NULL;
1657         block_thrashing->num_thrashing_pages       = 0;
1658         block_thrashing->last_processor            = UVM_ID_INVALID;
1659         block_thrashing->last_time_stamp           = 0;
1660         block_thrashing->last_thrashing_time_stamp = 0;
1661         uvm_page_mask_zero(&block_thrashing->thrashing_pages);
1662         goto done;
1663     }
1664 
1665     page_thrashing = &block_thrashing->pages[page_index];
1666 
1667     // Not enough thrashing events yet
1668     if (page_thrashing->num_thrashing_events < va_space_thrashing->params.threshold)
1669         goto done;
1670 
1671     // If the requesting processor is throttled, check the throttling end time
1672     // stamp
1673     if (uvm_processor_mask_test(&page_thrashing->throttled_processors, requester)) {
1674         NvU64 throttling_end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
1675         if (time_stamp < throttling_end_time_stamp &&
1676             !uvm_id_equal(requester, page_thrashing->do_not_throttle_processor_id)) {
1677             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1678             goto done;
1679         }
1680 
1681         thrashing_throttle_end_processor(va_block, block_thrashing, page_thrashing, page_index, requester);
1682     }
1683 
1684     UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester));
1685 
1686     last_time_stamp = page_thrashing_get_time_stamp(page_thrashing);
1687 
1688     // If the lapse since the last thrashing event is longer than a thrashing
1689     // lapse, we are no longer thrashing
1690     if (time_stamp - last_time_stamp > va_space_thrashing->params.lapse_ns &&
1691         !page_thrashing->pinned) {
1692         goto done;
1693     }
1694 
1695     // Set the requesting processor in the thrashing processors mask
1696     uvm_processor_mask_set(&page_thrashing->processors, requester);
1697 
1698     UVM_ASSERT(page_thrashing->has_migration_events || page_thrashing->has_revocation_events);
1699 
1700     // Update throttling heuristics
1701     thrashing_throttle_update(va_space_thrashing, va_block, page_thrashing, requester, time_stamp);
1702 
1703     if (page_thrashing->pinned &&
1704         page_thrashing->has_revocation_events &&
1705         !uvm_id_equal(requester, page_thrashing->do_not_throttle_processor_id)) {
1706 
1707         // When we get revocation thrashing, this is due to system-wide atomics
1708         // downgrading the permissions of other processors. Revocations only
1709         // happen when several processors are mapping the same page and there
1710         // are no migrations. In this case, the only thing we can do is to
1711         // throttle the execution of the processors.
1712         hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1713     }
1714     else {
1715         hint = get_hint_for_migration_thrashing(va_space_thrashing,
1716                                                 va_block,
1717                                                 page_index,
1718                                                 page_thrashing,
1719                                                 requester);
1720     }
1721 
1722 done:
1723     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
1724         NV_STATUS status = thrashing_pin_page(va_space_thrashing,
1725                                               va_block,
1726                                               block_thrashing,
1727                                               page_thrashing,
1728                                               page_index,
1729                                               time_stamp,
1730                                               hint.pin.residency,
1731                                               requester);
1732 
1733         // If there was some problem pinning the page (e.g. OOM), demote to
1734         // throttling
1735         if (status != NV_OK) {
1736             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1737         }
1738         else {
1739             if (uvm_id_equal(hint.pin.residency, requester))
1740                 PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_pin_local);
1741             else
1742                 PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_pin_remote);
1743 
1744             uvm_processor_mask_copy(&hint.pin.processors, &page_thrashing->processors);
1745         }
1746     }
1747 
1748     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
1749         thrashing_throttle_processor(va_block,
1750                                      block_thrashing,
1751                                      page_thrashing,
1752                                      page_index,
1753                                      requester);
1754 
1755         PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_throttle);
1756 
1757         hint.throttle.end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
1758     }
1759     else if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_NONE && page_thrashing) {
1760         UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester));
1761         UVM_ASSERT(!page_thrashing->pinned);
1762         UVM_ASSERT(UVM_ID_IS_INVALID(page_thrashing->pinned_residency_id));
1763     }
1764 
1765     return hint;
1766 }
1767 
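// Returns the mask of processors recorded as thrashing on the page that
// contains 'address'. Callers must only use this on blocks with active
// per-page thrashing state (enforced by the asserts below).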
1768 uvm_processor_mask_t *uvm_perf_thrashing_get_thrashing_processors(uvm_va_block_t *va_block, NvU64 address)
1769 {
1770     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1771     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1772     block_thrashing_info_t *block_thrashing = NULL;
1773     page_thrashing_info_t *page_thrashing = NULL;
1774     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
1775 
1776     UVM_ASSERT(g_uvm_perf_thrashing_enable);
1777     UVM_ASSERT(va_space_thrashing->params.enable);
1778 
1779     block_thrashing = thrashing_info_get(va_block);
1780     UVM_ASSERT(block_thrashing);
1781 
1782     UVM_ASSERT(block_thrashing->pages);
1783 
1784     page_thrashing = &block_thrashing->pages[page_index];
1785 
1786     return &page_thrashing->processors;
1787 }
1788 
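// Returns the mask of pages currently detected as thrashing in the block, or
// NULL if thrashing detection is disabled or no pages are thrashing.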
1789 const uvm_page_mask_t *uvm_perf_thrashing_get_thrashing_pages(uvm_va_block_t *va_block)
1790 {
1791     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1792     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1793     block_thrashing_info_t *block_thrashing = NULL;
1794 
1795     if (!va_space_thrashing->params.enable)
1796         return NULL;
1797 
1798     block_thrashing = thrashing_info_get(va_block);
1799     if (!block_thrashing)
1800         return NULL;
1801 
1802     if (block_thrashing->num_thrashing_pages == 0)
1803         return NULL;
1804 
1805     return &block_thrashing->thrashing_pages;
1806 }
1807 
1808 #define TIMER_GRANULARITY_NS 20000ULL
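// Delayed work callback that unpins pages whose pinning deadline has expired.
// It walks the VA space's list of pinned pages, resets the thrashing state of
// every entry whose deadline is within TIMER_GRANULARITY_NS of now, and
// re-schedules itself for the earliest remaining deadline.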
1809 static void thrashing_unpin_pages(struct work_struct *work)
1810 {
1811     struct delayed_work *dwork = to_delayed_work(work);
1812     va_space_thrashing_info_t *va_space_thrashing = container_of(dwork, va_space_thrashing_info_t, pinned_pages.dwork);
1813     uvm_va_space_t *va_space = va_space_thrashing->va_space;
1814     uvm_va_block_context_t *va_block_context = va_space_thrashing->pinned_pages.va_block_context;
1815 
1816     // Take the VA space lock so that VA blocks don't go away during this
1817     // operation.
1818     uvm_va_space_down_read(va_space);
1819 
1820     if (va_space_thrashing->pinned_pages.in_va_space_teardown)
1821         goto exit_no_list_lock;
1822 
1823     while (1) {
1824         pinned_page_t *pinned_page;
1825         uvm_va_block_t *va_block;
1826 
1827         uvm_spin_lock(&va_space_thrashing->pinned_pages.lock);
1828         pinned_page = list_first_entry_or_null(&va_space_thrashing->pinned_pages.list,
1829                                                pinned_page_t,
1830                                                va_space_list_entry);
1831 
1832         if (pinned_page) {
1833             NvU64 now = NV_GETTIME();
1834 
1835             if (pinned_page->deadline <= (now + TIMER_GRANULARITY_NS)) {
1836                 list_del_init(&pinned_page->va_space_list_entry);
1837 
1838                 // Work cancellation is left to thrashing_unpin_page() as this
1839                 // would only catch the following pattern:
1840                 // - Worker thread A is in thrashing_unpin_pages but hasn't
1841                 //   looked at the list yet
1842                 // - Thread B then removes the last entry
1843                 // - Thread C then adds a new entry and re-schedules work
1844                 // - Worker thread A removes the entry added by C because the
1845                 //   deadline has passed (unlikely), then cancels the work
1846                 //   scheduled by C.
1847             }
1848             else {
1849                 NvU64 elapsed_us = (pinned_page->deadline - now) / 1000;
1850 
1851                 schedule_delayed_work(&va_space_thrashing->pinned_pages.dwork, usecs_to_jiffies(elapsed_us));
1852                 uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
1853                 break;
1854             }
1855         }
1856 
1857         uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
1858 
1859         if (!pinned_page)
1860             break;
1861 
1862         va_block = pinned_page->va_block;
1863         if (uvm_va_block_is_hmm(va_block))
1864             uvm_hmm_migrate_begin_wait(va_block);
1865         uvm_mutex_lock(&va_block->lock);
1866 
1867         // Only operate if the pinned page's tracking state isn't already
1868         // cleared by thrashing_unpin_page()
1869         if (!list_empty(&pinned_page->va_block_list_entry)) {
1870             uvm_page_index_t page_index = pinned_page->page_index;
1871             block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
1872 
1873             UVM_ASSERT(block_thrashing);
1874             UVM_ASSERT(uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
1875 
1876             uvm_va_block_context_init(va_block_context, NULL);
1877 
1878             uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
1879                                                              va_block_context,
1880                                                              uvm_va_block_region_for_page(page_index));
1881             thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
1882         }
1883 
1884         uvm_mutex_unlock(&va_block->lock);
1885         if (uvm_va_block_is_hmm(va_block))
1886             uvm_hmm_migrate_finish(va_block);
1887         kmem_cache_free(g_pinned_page_cache, pinned_page);
1888     }
1889 
1890 exit_no_list_lock:
1891     uvm_va_space_up_read(va_space);
1892 }
1893 
1894 static void thrashing_unpin_pages_entry(struct work_struct *work)
1895 {
1896     UVM_ENTRY_VOID(thrashing_unpin_pages(work));
1897 }
1898 
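// Set up the per-VA-space thrashing state: load the thrashing perf module and
// allocate the state that tracks pinned pages (spinlock, list and delayed
// work item).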
1899 NV_STATUS uvm_perf_thrashing_load(uvm_va_space_t *va_space)
1900 {
1901     va_space_thrashing_info_t *va_space_thrashing;
1902     NV_STATUS status;
1903 
1904     status = uvm_perf_module_load(&g_module_thrashing, va_space);
1905     if (status != NV_OK)
1906         return status;
1907 
1908     va_space_thrashing = va_space_thrashing_info_create(va_space);
1909     if (!va_space_thrashing)
1910         return NV_ERR_NO_MEMORY;
1911 
1912     uvm_spin_lock_init(&va_space_thrashing->pinned_pages.lock, UVM_LOCK_ORDER_LEAF);
1913     INIT_LIST_HEAD(&va_space_thrashing->pinned_pages.list);
1914     INIT_DELAYED_WORK(&va_space_thrashing->pinned_pages.dwork, thrashing_unpin_pages_entry);
1915 
1916     return NV_OK;
1917 }
1918 
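// First stage of VA space teardown for this module: prevent further unpin
// work from being scheduled and cancel (waiting for) any pending work.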
1919 void uvm_perf_thrashing_stop(uvm_va_space_t *va_space)
1920 {
1921     va_space_thrashing_info_t *va_space_thrashing;
1922 
1923     uvm_va_space_down_write(va_space);
1924     va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
1925 
1926     // Prevent further unpinning operations from being scheduled
1927     if (va_space_thrashing)
1928         va_space_thrashing->pinned_pages.in_va_space_teardown = true;
1929 
1930     uvm_va_space_up_write(va_space);
1931 
1932     // Cancel any pending work. We can safely access va_space_thrashing
1933     // because this function is called once from the VA space teardown path,
1934     // and the only function that frees it is uvm_perf_thrashing_unload,
1935     // which is called later in the teardown path.
1936     if (va_space_thrashing)
1937         (void)cancel_delayed_work_sync(&va_space_thrashing->pinned_pages.dwork);
1938 }
1939 
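// Final stage of VA space teardown for this module: unload the perf module
// and free the per-VA-space thrashing state.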
1940 void uvm_perf_thrashing_unload(uvm_va_space_t *va_space)
1941 {
1942     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
1943 
1944     uvm_perf_module_unload(&g_module_thrashing, va_space);
1945 
1946     // Make sure that there are no pending work items
1947     if (va_space_thrashing) {
1948         UVM_ASSERT(list_empty(&va_space_thrashing->pinned_pages.list));
1949 
1950         va_space_thrashing_info_destroy(va_space);
1951     }
1952 }
1953 
1954 NV_STATUS uvm_perf_thrashing_register_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
1955 {
1956     // If a simulated GPU is registered, re-initialize thrashing parameters in
1957     // case they need to be adjusted.
1958     bool params_need_readjusting = g_uvm_global.num_simulated_devices > 0;
1959 
1960     // Likewise, when the Confidential Computing feature is enabled, the DMA
1961     // path is slower due to cryptographic operations and other associated
1962     // overhead. Enforce a larger window to allow the thrashing mitigation
1963     // mechanisms to work properly.
1964     params_need_readjusting = params_need_readjusting || uvm_conf_computing_mode_enabled(gpu);
1965 
1966     if (params_need_readjusting) {
1967         va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1968 
1969         if (!va_space_thrashing->params.test_overrides) {
1970             if (uvm_conf_computing_mode_enabled(gpu))
1971                 g_uvm_perf_thrashing_lapse_usec = UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_HCC;
1972 
1973             va_space_thrashing_info_init_params(va_space_thrashing);
1974         }
1975     }
1976 
1977     return NV_OK;
1978 }
1979 
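// Global initialization: read the thrashing module parameters, initialize the
// perf module descriptor and its event callbacks, create the kmem caches for
// per-block thrashing state and pinned-page entries, and set up the CPU
// thrashing stats. On failure, everything is torn down via
// uvm_perf_thrashing_exit.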
1980 NV_STATUS uvm_perf_thrashing_init(void)
1981 {
1982     NV_STATUS status;
1983 
1984     INIT_THRASHING_PARAMETER_TOGGLE(uvm_perf_thrashing_enable, UVM_PERF_THRASHING_ENABLE_DEFAULT);
1985     if (!g_uvm_perf_thrashing_enable)
1986         return NV_OK;
1987 
1988     uvm_perf_module_init("perf_thrashing",
1989                          UVM_PERF_MODULE_TYPE_THRASHING,
1990                          g_callbacks_thrashing,
1991                          ARRAY_SIZE(g_callbacks_thrashing),
1992                          &g_module_thrashing);
1993 
1994     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_threshold,
1995                                          UVM_PERF_THRASHING_THRESHOLD_DEFAULT,
1996                                          UVM_PERF_THRASHING_THRESHOLD_MAX);
1997 
1998     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_pin_threshold,
1999                                          UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT,
2000                                          UVM_PERF_THRASHING_PIN_THRESHOLD_MAX);
2001 
2002     INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT);
2003 
2004     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_nap,
2005                                          UVM_PERF_THRASHING_NAP_DEFAULT,
2006                                          UVM_PERF_THRASHING_NAP_MAX);
2007 
2009     INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_epoch, UVM_PERF_THRASHING_EPOCH_DEFAULT);
2010 
2011     INIT_THRASHING_PARAMETER(uvm_perf_thrashing_pin, UVM_PERF_THRASHING_PIN_DEFAULT);
2012 
2013     INIT_THRASHING_PARAMETER(uvm_perf_thrashing_max_resets, UVM_PERF_THRASHING_MAX_RESETS_DEFAULT);
2014 
2015     g_va_block_thrashing_info_cache = NV_KMEM_CACHE_CREATE("uvm_block_thrashing_info_t", block_thrashing_info_t);
2016     if (!g_va_block_thrashing_info_cache) {
2017         status = NV_ERR_NO_MEMORY;
2018         goto error;
2019     }
2020 
2021     g_pinned_page_cache = NV_KMEM_CACHE_CREATE("uvm_pinned_page_t", pinned_page_t);
2022     if (!g_pinned_page_cache) {
2023         status = NV_ERR_NO_MEMORY;
2024         goto error;
2025     }
2026 
2027     status = cpu_thrashing_stats_init();
2028     if (status != NV_OK)
2029         goto error;
2030 
2031     return NV_OK;
2032 
2033 error:
2034     uvm_perf_thrashing_exit();
2035 
2036     return status;
2037 }
2038 
2039 void uvm_perf_thrashing_exit(void)
2040 {
2041     cpu_thrashing_stats_exit();
2042 
2043     kmem_cache_destroy_safe(&g_va_block_thrashing_info_cache);
2044     kmem_cache_destroy_safe(&g_pinned_page_cache);
2045 }
2046 
2047 NV_STATUS uvm_perf_thrashing_add_gpu(uvm_gpu_t *gpu)
2048 {
2049     if (!uvm_procfs_is_debug_enabled())
2050         return NV_OK;
2051 
2052     return gpu_thrashing_stats_create(gpu);
2053 }
2054 
2055 void uvm_perf_thrashing_remove_gpu(uvm_gpu_t *gpu)
2056 {
2057     gpu_thrashing_stats_destroy(gpu);
2058 }
2059 
2060 NV_STATUS uvm_test_get_page_thrashing_policy(UVM_TEST_GET_PAGE_THRASHING_POLICY_PARAMS *params, struct file *filp)
2061 {
2062     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2063     va_space_thrashing_info_t *va_space_thrashing;
2064 
2065     uvm_va_space_down_read(va_space);
2066 
2067     va_space_thrashing = va_space_thrashing_info_get(va_space);
2068 
2069     if (va_space_thrashing->params.enable) {
2070         params->policy = UVM_TEST_PAGE_THRASHING_POLICY_ENABLE;
2071         params->nap_ns = va_space_thrashing->params.nap_ns;
2072         params->pin_ns = va_space_thrashing->params.pin_ns;
2073         params->map_remote_on_native_atomics_fault = uvm_perf_map_remote_on_native_atomics_fault != 0;
2074     }
2075     else {
2076         params->policy = UVM_TEST_PAGE_THRASHING_POLICY_DISABLE;
2077     }
2078 
2079     uvm_va_space_up_read(va_space);
2080 
2081     return NV_OK;
2082 }
2083 
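// Test-only ioctl handler that enables or disables thrashing detection for a
// VA space. Disabling it unmaps remote pinned pages and destroys the
// per-block thrashing state for every managed VA range (and for HMM); on
// failure, detection is re-enabled so that no stale state is left behind
// while detection is off.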
2084 NV_STATUS uvm_test_set_page_thrashing_policy(UVM_TEST_SET_PAGE_THRASHING_POLICY_PARAMS *params, struct file *filp)
2085 {
2086     NV_STATUS status = NV_OK;
2087     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2088     va_space_thrashing_info_t *va_space_thrashing;
2089 
2090     if (params->policy >= UVM_TEST_PAGE_THRASHING_POLICY_MAX)
2091         return NV_ERR_INVALID_ARGUMENT;
2092 
2093     if (!g_uvm_perf_thrashing_enable)
2094         return NV_ERR_INVALID_STATE;
2095 
2096     uvm_va_space_down_write(va_space);
2097 
2098     va_space_thrashing = va_space_thrashing_info_get(va_space);
2099     va_space_thrashing->params.test_overrides = true;
2100 
2101     if (params->policy == UVM_TEST_PAGE_THRASHING_POLICY_ENABLE) {
2102         if (va_space_thrashing->params.enable)
2103             goto done_unlock_va_space;
2104 
2105         va_space_thrashing->params.pin_ns = params->pin_ns;
2106         va_space_thrashing->params.enable = true;
2107     }
2108     else {
2109         if (!va_space_thrashing->params.enable)
2110             goto done_unlock_va_space;
2111 
2112         va_space_thrashing->params.enable = false;
2113     }
2114 
2115     // When disabling thrashing detection, destroy the thrashing tracking
2116     // information for all VA blocks and unpin pages
2117     if (!va_space_thrashing->params.enable) {
2118         uvm_va_range_t *va_range;
2119 
2120         uvm_for_each_va_range(va_range, va_space) {
2121             uvm_va_block_t *va_block;
2122 
2123             if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
2124                 continue;
2125 
2126             for_each_va_block_in_va_range(va_range, va_block) {
2127                 uvm_va_block_region_t va_block_region = uvm_va_block_region_from_block(va_block);
2128                 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
2129 
2130                 uvm_mutex_lock(&va_block->lock);
2131 
2132                 // Unmap may split PTEs and require a retry. Needs to be called
2133                 // before the pinned pages information is destroyed.
2134                 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
2135                              uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
2136                                                                               block_context,
2137                                                                               va_block_region));
2138 
2139                 uvm_perf_thrashing_info_destroy(va_block);
2140 
2141                 uvm_mutex_unlock(&va_block->lock);
2142 
2143                 // Re-enable thrashing on failure to avoid getting asserts
2144                 // about having state while thrashing is disabled
2145                 if (status != NV_OK) {
2146                     va_space_thrashing->params.enable = true;
2147                     goto done_unlock_va_space;
2148                 }
2149             }
2150         }
2151 
2152         status = uvm_hmm_clear_thrashing_policy(va_space);
2153 
2154         // Re-enable thrashing on failure to avoid getting asserts
2155         // about having state while thrashing is disabled
2156         if (status != NV_OK) {
2157             va_space_thrashing->params.enable = true;
2158             goto done_unlock_va_space;
2159         }
2160     }
2161 
2162 done_unlock_va_space:
2163     uvm_va_space_up_write(va_space);
2164 
2165     return status;
2166 }
2167