1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_api.h"
25 #include "uvm_global.h"
26 #include "uvm_perf_events.h"
27 #include "uvm_perf_module.h"
28 #include "uvm_perf_thrashing.h"
29 #include "uvm_perf_utils.h"
30 #include "uvm_va_block.h"
31 #include "uvm_va_range.h"
32 #include "uvm_kvmalloc.h"
33 #include "uvm_tools.h"
34 #include "uvm_procfs.h"
35 #include "uvm_test.h"
36 
37 // Number of bits for page-granularity time stamps. Currently we ignore the first 6 bits
38 // of the timestamp (i.e. we have 64ns resolution, which is good enough)
39 #define PAGE_THRASHING_LAST_TIME_STAMP_BITS 58
40 #define PAGE_THRASHING_NUM_EVENTS_BITS      3
41 
42 #define PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS 58
43 #define PAGE_THRASHING_THROTTLING_COUNT_BITS          8
44 
45 // Per-page thrashing detection structure.
46 typedef struct
47 {
48     struct
49     {
50         // Last time stamp when a thrashing-related event was recorded
51         NvU64                        last_time_stamp : PAGE_THRASHING_LAST_TIME_STAMP_BITS;
52 
53         bool                    has_migration_events : 1;
54 
55         bool                   has_revocation_events : 1;
56 
57         // Number of consecutive "thrashing" events (within the configured
58         // thrashing lapse)
59         NvU8                    num_thrashing_events : PAGE_THRASHING_NUM_EVENTS_BITS;
60 
61         bool                                  pinned : 1;
62     };
63 
64     struct
65     {
66         // Deadline for throttled processors to wake up
67         NvU64              throttling_end_time_stamp : PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS;
68 
69         // Number of times a processor has been throttled. This is used to
70         // determine when the page needs to get pinned. After getting pinned
71         // this field is always 0.
72         NvU8                        throttling_count : PAGE_THRASHING_THROTTLING_COUNT_BITS;
73     };
74 
75     // Processors accessing this page
76     uvm_processor_mask_t                  processors;
77 
78     // Processors that have been throttled. This must be a subset of processors
79     uvm_processor_mask_t        throttled_processors;
80 
81     // Memory residency for the page when in pinning phase
82     uvm_processor_id_t           pinned_residency_id;
83 
84     // Processor not to be throttled in the current throttling period
85     uvm_processor_id_t  do_not_throttle_processor_id;
86 } page_thrashing_info_t;
87 
88 // Per-VA block thrashing detection structure. This state is protected by the
89 // VA block lock.
90 typedef struct
91 {
92     page_thrashing_info_t                     *pages;
93 
94     NvU16                        num_thrashing_pages;
95 
96     NvU8                       thrashing_reset_count;
97 
98     uvm_processor_id_t                last_processor;
99 
100     NvU64                            last_time_stamp;
101 
102     NvU64                  last_thrashing_time_stamp;
103 
104     // Stats
105     NvU32                           throttling_count;
106 
107     uvm_page_mask_t                  thrashing_pages;
108 
109     struct
110     {
111         NvU32                                  count;
112 
113         uvm_page_mask_t                         mask;
114 
115         // List of pinned pages. This list is only used if the pinning timeout
116         // is not 0.
117         struct list_head                        list;
118     } pinned_pages;
119 } block_thrashing_info_t;
120 
121 // Descriptor for a page that has been pinned due to thrashing. This structure
122 // is only used if the pinning timeout is not 0.
123 typedef struct
124 {
125     uvm_va_block_t                         *va_block;
126 
127     // Page index within va_block
128     uvm_page_index_t                      page_index;
129 
130     // Absolute timestamp after which the page will be unpinned
131     NvU64                                   deadline;
132 
133     // Entry in the per-VA Space list of pinned pages. See
134     // va_space_thrashing_info_t::pinned_pages::list.
135     struct list_head             va_space_list_entry;
136 
137     // Entry in the per-VA Block list of pinned pages. See
138     // block_thrashing_info_t::pinned_pages::list.
139     struct list_head             va_block_list_entry;
140 } pinned_page_t;
141 
142 // Per-VA space data structures and policy configuration
143 typedef struct
144 {
145     // Per-VA space accounting of pinned pages that is used to speculatively
146     // unpin pages after the configured timeout. This struct is only used if
147     // the pinning timeout is not 0.
148     struct
149     {
150         // Work descriptor that is executed asynchronously by a helper thread
151         struct delayed_work                    dwork;
152 
153         // List of pinned pages. They are (mostly) ordered by unpin deadline.
154         // New entries are inserted blindly at the tail since the expectation
155         // is that they will have the largest deadline value. However, given
156         // the drift between when multiple threads query their timestamps and
157         // add those pages to the list under the lock, it might not be
158         // strictly ordered. But this is OK since the difference will be very
159         // small and they will be eventually removed from the list.
160         //
161         // Entries are removed when they reach the deadline by the function
162         // configured in dwork. This list is protected by lock.
163         struct list_head                        list;
164 
165         uvm_spinlock_t                          lock;
166 
167         uvm_va_block_context_t      *va_block_context;
168 
169         // Flag used to avoid scheduling delayed unpinning operations after
170         // uvm_perf_thrashing_stop has been called.
171         bool                    in_va_space_teardown;
172     } pinned_pages;
173 
174     struct
175     {
176         // Whether thrashing mitigation is enabled on this VA space
177         bool                                  enable;
178 
179         // true if the thrashing mitigation parameters have been modified using
180         // test ioctls
181         bool                          test_overrides;
182 
183         //
184         // Fields below are the thrashing mitigation parameters on the VA space
185         //
186         unsigned                           threshold;
187 
188         unsigned                       pin_threshold;
189 
190         NvU64                               lapse_ns;
191 
192         NvU64                                 nap_ns;
193 
194         NvU64                               epoch_ns;
195 
196         unsigned                          max_resets;
197 
198         NvU64                                 pin_ns;
199     } params;
200 
201     uvm_va_space_t                         *va_space;
202 } va_space_thrashing_info_t;
203 
204 typedef struct
205 {
206     // Entry for the per-processor thrashing_stats file in procfs
207     struct proc_dir_entry *procfs_file;
208 
209     // Number of times thrashing is detected
210     atomic64_t num_thrashing;
211 
212     // Number of times the processor was throttled while thrashing
213     atomic64_t num_throttle;
214 
215     // Number of times a page was pinned on this processor while thrashing
216     atomic64_t num_pin_local;
217 
218     // Number of times a page was pinned on a different processor while thrashing
219     atomic64_t num_pin_remote;
220 } processor_thrashing_stats_t;
221 
222 // Pre-allocated thrashing stats structure for the CPU. This is only valid if
223 // uvm_procfs_is_debug_enabled() returns true.
224 static processor_thrashing_stats_t g_cpu_thrashing_stats;
225 
226 #define PROCESSOR_THRASHING_STATS_INC(va_space, proc, field)                                         \
227     do {                                                                                             \
228         processor_thrashing_stats_t *_processor_stats = thrashing_stats_get_or_null(va_space, proc); \
229         if (_processor_stats)                                                                        \
230             atomic64_inc(&_processor_stats->field);                                                  \
231     } while (0)
232 
233 // Global caches for the per-VA block thrashing detection structures
234 static struct kmem_cache *g_va_block_thrashing_info_cache __read_mostly;
235 static struct kmem_cache *g_pinned_page_cache __read_mostly;
236 
237 //
238 // Tunables for thrashing detection/prevention (configurable via module parameters)
239 //
240 
241 #define UVM_PERF_THRASHING_ENABLE_DEFAULT 1
242 
243 // Enable/disable thrashing performance heuristics
244 static unsigned uvm_perf_thrashing_enable = UVM_PERF_THRASHING_ENABLE_DEFAULT;
245 
246 #define UVM_PERF_THRASHING_THRESHOLD_DEFAULT 3
247 #define UVM_PERF_THRASHING_THRESHOLD_MAX     ((1 << PAGE_THRASHING_NUM_EVENTS_BITS) - 1)
248 
249 // Number of consecutive thrashing events to initiate thrashing prevention
250 //
251 // Maximum value is UVM_PERF_THRASHING_THRESHOLD_MAX
252 static unsigned uvm_perf_thrashing_threshold = UVM_PERF_THRASHING_THRESHOLD_DEFAULT;
253 
254 #define UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT 10
255 #define UVM_PERF_THRASHING_PIN_THRESHOLD_MAX     ((1 << PAGE_THRASHING_THROTTLING_COUNT_BITS) - 1)
256 
257 // Number of consecutive throttling operations before trying to map remotely
258 //
259 // Maximum value is UVM_PERF_THRASHING_PIN_THRESHOLD_MAX
260 static unsigned uvm_perf_thrashing_pin_threshold = UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT;
261 
262 // TODO: Bug 1768615: [uvm] Automatically tune default values for thrashing
263 // detection/prevention parameters
264 #define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT 500
265 #define UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_EMULATION (UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 800)
266 
267 // Lapse of time in microseconds that determines if two consecutive events on
268 // the same page can be considered thrashing
269 static unsigned uvm_perf_thrashing_lapse_usec = UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT;
270 
271 #define UVM_PERF_THRASHING_NAP_DEFAULT 1
272 #define UVM_PERF_THRASHING_NAP_MAX     100
273 
274 // Time that the processor being throttled is forbidden to work on the thrashing
275 // page. This value is a multiplier of uvm_perf_thrashing_lapse_usec.
276 static unsigned uvm_perf_thrashing_nap = UVM_PERF_THRASHING_NAP_DEFAULT;
277 
278 #define UVM_PERF_THRASHING_EPOCH_DEFAULT 2000
279 
280 // Time lapse after which we consider thrashing is no longer happening. This
281 // value is a multiplier of uvm_perf_thrashing_lapse_usec.
282 static unsigned uvm_perf_thrashing_epoch = UVM_PERF_THRASHING_EPOCH_DEFAULT;
283 
284 // When pages are pinned and the rest of thrashing processors are mapped
285 // remotely we lose track of who is accessing the page for the rest of
286 // program execution. This can lead to tremendous performance loss if the page
287 // is not thrashing anymore and it is always being accessed remotely.
288 // In order to avoid that scenario, we use a timer that unpins memory after
289 // some time. We use a per-VA space list of pinned pages, sorted by the
290 // deadline at which it will be unmapped from remote processors. Therefore,
291 // next remote access will trigger a fault that will migrate the page.
292 #define UVM_PERF_THRASHING_PIN_DEFAULT 300
293 #define UVM_PERF_THRASHING_PIN_DEFAULT_EMULATION 10
294 
295 // Time for which a page remains pinned. This value is a multiplier of
296 // uvm_perf_thrashing_lapse_usec. 0 means that it is pinned forever.
297 static unsigned uvm_perf_thrashing_pin = UVM_PERF_THRASHING_PIN_DEFAULT;
298 
299 // Number of times a VA block can be reset back to non-thrashing. This
300 // mechanism tries to avoid performing optimizations on a block that periodically
301 // causes thrashing
302 #define UVM_PERF_THRASHING_MAX_RESETS_DEFAULT 4
303 
304 static unsigned uvm_perf_thrashing_max_resets = UVM_PERF_THRASHING_MAX_RESETS_DEFAULT;
305 
306 // Module parameters for the tunables
307 module_param(uvm_perf_thrashing_enable,        uint, S_IRUGO);
308 module_param(uvm_perf_thrashing_threshold,     uint, S_IRUGO);
309 module_param(uvm_perf_thrashing_pin_threshold, uint, S_IRUGO);
310 module_param(uvm_perf_thrashing_lapse_usec,    uint, S_IRUGO);
311 module_param(uvm_perf_thrashing_nap,           uint, S_IRUGO);
312 module_param(uvm_perf_thrashing_epoch,         uint, S_IRUGO);
313 module_param(uvm_perf_thrashing_pin,           uint, S_IRUGO);
314 module_param(uvm_perf_thrashing_max_resets,    uint, S_IRUGO);
315 
316 // See map_remote_on_atomic_fault uvm_va_block.c
317 unsigned uvm_perf_map_remote_on_native_atomics_fault = 0;
318 module_param(uvm_perf_map_remote_on_native_atomics_fault, uint, S_IRUGO);
319 
320 // Global post-processed values of the module parameters. They can be overriden
321 // per VA-space.
322 static bool g_uvm_perf_thrashing_enable;
323 static unsigned g_uvm_perf_thrashing_threshold;
324 static unsigned g_uvm_perf_thrashing_pin_threshold;
325 static NvU64 g_uvm_perf_thrashing_lapse_usec;
326 static NvU64 g_uvm_perf_thrashing_nap;
327 static NvU64 g_uvm_perf_thrashing_epoch;
328 static NvU64 g_uvm_perf_thrashing_pin;
329 static unsigned g_uvm_perf_thrashing_max_resets;
330 
331 // Helper macros to initialize thrashing parameters from module parameters
332 //
333 // This helper returns whether the type for the parameter is signed
334 #define THRASHING_PARAMETER_IS_SIGNED(v) (((typeof(v)) -1) < 0)
335 
336 // Macro that initializes the given thrashing parameter and checks its validity
337 // (within [_mi:_ma]). Otherwise it is initialized with the given default
338 // parameter _d. The user value is read from _v, and the final value is stored
339 // in a variable named g_##_v, so it must be declared, too. Only unsigned
340 // parameters are supported.
341 #define INIT_THRASHING_PARAMETER_MIN_MAX(_v, _d, _mi, _ma)                      \
342     do {                                                                        \
343         unsigned v = (_v);                                                      \
344         unsigned d = (_d);                                                      \
345         unsigned mi = (_mi);                                                    \
346         unsigned ma = (_ma);                                                    \
347                                                                                 \
348         BUILD_BUG_ON(sizeof(_v) > sizeof(unsigned));                            \
349         BUILD_BUG_ON(THRASHING_PARAMETER_IS_SIGNED(_v));                        \
350                                                                                 \
351         UVM_ASSERT(mi <= ma);                                                   \
352         UVM_ASSERT(d >= mi);                                                    \
353         UVM_ASSERT(d <= ma);                                                    \
354                                                                                 \
355         if (v >= mi && v <= ma) {                                               \
356             g_##_v = v;                                                         \
357         }                                                                       \
358         else {                                                                  \
359             pr_info("Invalid value %u for " #_v ". Using %u instead\n", v, d);  \
360                                                                                 \
361             g_##_v = d;                                                         \
362         }                                                                       \
363     } while (0)
364 
365 #define INIT_THRASHING_PARAMETER(v, d)                 INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 0u, UINT_MAX)
366 
367 #define INIT_THRASHING_PARAMETER_MIN(v, d, mi)         INIT_THRASHING_PARAMETER_MIN_MAX(v, d, mi, UINT_MAX)
368 #define INIT_THRASHING_PARAMETER_MAX(v, d, ma)         INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 0u, ma)
369 
370 #define INIT_THRASHING_PARAMETER_NONZERO(v, d)         INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 1u, UINT_MAX)
371 #define INIT_THRASHING_PARAMETER_NONZERO_MAX(v, d, ma) INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 1u, ma)
372 
373 #define INIT_THRASHING_PARAMETER_TOGGLE(v, d)          INIT_THRASHING_PARAMETER_MIN_MAX(v, d, 0u, 1u)
374 
375 // Helpers to get/set the time stamp
page_thrashing_get_time_stamp(page_thrashing_info_t * entry)376 static NvU64 page_thrashing_get_time_stamp(page_thrashing_info_t *entry)
377 {
378     return entry->last_time_stamp << (64 - PAGE_THRASHING_LAST_TIME_STAMP_BITS);
379 }
380 
page_thrashing_set_time_stamp(page_thrashing_info_t * entry,NvU64 time_stamp)381 static void page_thrashing_set_time_stamp(page_thrashing_info_t *entry, NvU64 time_stamp)
382 {
383     entry->last_time_stamp = time_stamp >> (64 - PAGE_THRASHING_LAST_TIME_STAMP_BITS);
384 }
385 
page_thrashing_get_throttling_end_time_stamp(page_thrashing_info_t * entry)386 static NvU64 page_thrashing_get_throttling_end_time_stamp(page_thrashing_info_t *entry)
387 {
388     return entry->throttling_end_time_stamp << (64 - PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS);
389 }
390 
page_thrashing_set_throttling_end_time_stamp(page_thrashing_info_t * entry,NvU64 time_stamp)391 static void page_thrashing_set_throttling_end_time_stamp(page_thrashing_info_t *entry, NvU64 time_stamp)
392 {
393     entry->throttling_end_time_stamp = time_stamp >> (64 - PAGE_THRASHING_THROTTLING_END_TIME_STAMP_BITS);
394 }
395 
396 // Performance heuristics module for thrashing
397 static uvm_perf_module_t g_module_thrashing;
398 
399 // Callback declaration for the performance heuristics events
400 static void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
401 static void thrashing_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
402 static void thrashing_block_munmap_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
403 
404 static uvm_perf_module_event_callback_desc_t g_callbacks_thrashing[] = {
405     { UVM_PERF_EVENT_BLOCK_DESTROY, thrashing_block_destroy_cb },
406     { UVM_PERF_EVENT_MODULE_UNLOAD, thrashing_block_destroy_cb },
407     { UVM_PERF_EVENT_BLOCK_SHRINK , thrashing_block_destroy_cb },
408     { UVM_PERF_EVENT_BLOCK_MUNMAP , thrashing_block_munmap_cb  },
409     { UVM_PERF_EVENT_MIGRATION,     thrashing_event_cb         },
410     { UVM_PERF_EVENT_REVOCATION,    thrashing_event_cb         }
411 };
412 
nv_procfs_read_thrashing_stats(struct seq_file * s,void * v)413 static int nv_procfs_read_thrashing_stats(struct seq_file *s, void *v)
414 {
415     processor_thrashing_stats_t *processor_stats = (processor_thrashing_stats_t *)s->private;
416 
417     UVM_ASSERT(processor_stats);
418 
419     if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
420             return -EAGAIN;
421 
422     UVM_SEQ_OR_DBG_PRINT(s, "thrashing     %llu\n", (NvU64)atomic64_read(&processor_stats->num_thrashing));
423     UVM_SEQ_OR_DBG_PRINT(s, "throttle      %llu\n", (NvU64)atomic64_read(&processor_stats->num_throttle));
424     UVM_SEQ_OR_DBG_PRINT(s, "pin_local     %llu\n", (NvU64)atomic64_read(&processor_stats->num_pin_local));
425     UVM_SEQ_OR_DBG_PRINT(s, "pin_remote    %llu\n", (NvU64)atomic64_read(&processor_stats->num_pin_remote));
426 
427     uvm_up_read(&g_uvm_global.pm.lock);
428 
429     return 0;
430 }
431 
nv_procfs_read_thrashing_stats_entry(struct seq_file * s,void * v)432 static int nv_procfs_read_thrashing_stats_entry(struct seq_file *s, void *v)
433 {
434     UVM_ENTRY_RET(nv_procfs_read_thrashing_stats(s, v));
435 }
436 
437 UVM_DEFINE_SINGLE_PROCFS_FILE(thrashing_stats_entry);
438 
439 #define THRASHING_STATS_FILE_NAME "thrashing_stats"
440 
441 // Initialization/deinitialization of CPU thrashing stats
442 //
cpu_thrashing_stats_init(void)443 static NV_STATUS cpu_thrashing_stats_init(void)
444 {
445     struct proc_dir_entry *cpu_base_dir_entry = uvm_procfs_get_cpu_base_dir();
446 
447     if (uvm_procfs_is_debug_enabled()) {
448         UVM_ASSERT(!g_cpu_thrashing_stats.procfs_file);
449         g_cpu_thrashing_stats.procfs_file = NV_CREATE_PROC_FILE(THRASHING_STATS_FILE_NAME,
450                                                                 cpu_base_dir_entry,
451                                                                 thrashing_stats_entry,
452                                                                 &g_cpu_thrashing_stats);
453         if (!g_cpu_thrashing_stats.procfs_file)
454             return NV_ERR_OPERATING_SYSTEM;
455     }
456 
457     return NV_OK;
458 }
459 
cpu_thrashing_stats_exit(void)460 static void cpu_thrashing_stats_exit(void)
461 {
462     if (g_cpu_thrashing_stats.procfs_file) {
463         UVM_ASSERT(uvm_procfs_is_debug_enabled());
464         proc_remove(g_cpu_thrashing_stats.procfs_file);
465         g_cpu_thrashing_stats.procfs_file = NULL;
466     }
467 }
468 
469 // Get the thrashing stats struct for the given VA space if it exists
470 //
471 // No lock may be held. Therefore, the stats must be updated using atomics
gpu_thrashing_stats_get_or_null(uvm_gpu_t * gpu)472 static processor_thrashing_stats_t *gpu_thrashing_stats_get_or_null(uvm_gpu_t *gpu)
473 {
474     return uvm_perf_module_type_data(gpu->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
475 }
476 
thrashing_stats_get_or_null(uvm_va_space_t * va_space,uvm_processor_id_t id)477 static processor_thrashing_stats_t *thrashing_stats_get_or_null(uvm_va_space_t *va_space, uvm_processor_id_t id)
478 {
479     if (UVM_ID_IS_CPU(id)) {
480         if (g_cpu_thrashing_stats.procfs_file)
481             return &g_cpu_thrashing_stats;
482 
483         return NULL;
484     }
485 
486     return gpu_thrashing_stats_get_or_null(uvm_va_space_get_gpu(va_space, id));
487 }
488 
489 // Create the thrashing stats struct for the given GPU
490 //
491 // Global lock needs to be held
gpu_thrashing_stats_create(uvm_gpu_t * gpu)492 static NV_STATUS gpu_thrashing_stats_create(uvm_gpu_t *gpu)
493 {
494     processor_thrashing_stats_t *gpu_thrashing;
495 
496     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
497     UVM_ASSERT(gpu_thrashing_stats_get_or_null(gpu) == NULL);
498     UVM_ASSERT(uvm_procfs_is_debug_enabled());
499 
500     gpu_thrashing = uvm_kvmalloc_zero(sizeof(*gpu_thrashing));
501     if (!gpu_thrashing)
502         return NV_ERR_NO_MEMORY;
503 
504     gpu_thrashing->procfs_file = NV_CREATE_PROC_FILE(THRASHING_STATS_FILE_NAME,
505                                                      gpu->procfs.dir,
506                                                      thrashing_stats_entry,
507                                                      gpu_thrashing);
508     if (!gpu_thrashing->procfs_file) {
509         uvm_kvfree(gpu_thrashing);
510         return NV_ERR_OPERATING_SYSTEM;
511     }
512 
513     uvm_perf_module_type_set_data(gpu->perf_modules_data, gpu_thrashing, UVM_PERF_MODULE_TYPE_THRASHING);
514 
515     return NV_OK;
516 }
517 
gpu_thrashing_stats_destroy(uvm_gpu_t * gpu)518 static void gpu_thrashing_stats_destroy(uvm_gpu_t *gpu)
519 {
520     processor_thrashing_stats_t *gpu_thrashing = gpu_thrashing_stats_get_or_null(gpu);
521 
522     uvm_assert_mutex_locked(&g_uvm_global.global_lock);
523 
524     if (gpu_thrashing) {
525         uvm_perf_module_type_unset_data(gpu->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
526 
527         if (gpu_thrashing->procfs_file)
528             proc_remove(gpu_thrashing->procfs_file);
529 
530         uvm_kvfree(gpu_thrashing);
531     }
532 }
533 
534 // Get the thrashing detection struct for the given VA space if it exists
535 //
536 // The caller must ensure that the va_space cannot be deleted, for the
537 // duration of this call. Holding either the va_block or va_space lock will do
538 // that.
va_space_thrashing_info_get_or_null(uvm_va_space_t * va_space)539 static va_space_thrashing_info_t *va_space_thrashing_info_get_or_null(uvm_va_space_t *va_space)
540 {
541     return uvm_perf_module_type_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
542 }
543 
544 // Get the thrashing detection struct for the given VA space. It asserts that
545 // the information has been previously created.
546 //
547 // The caller must ensure that the va_space cannot be deleted, for the
548 // duration of this call. Holding either the va_block or va_space lock will do
549 // that.
va_space_thrashing_info_get(uvm_va_space_t * va_space)550 static va_space_thrashing_info_t *va_space_thrashing_info_get(uvm_va_space_t *va_space)
551 {
552     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
553     UVM_ASSERT(va_space_thrashing);
554 
555     return va_space_thrashing;
556 }
557 
va_space_thrashing_info_init_params(va_space_thrashing_info_t * va_space_thrashing)558 static void va_space_thrashing_info_init_params(va_space_thrashing_info_t *va_space_thrashing)
559 {
560     UVM_ASSERT(!va_space_thrashing->params.test_overrides);
561 
562     va_space_thrashing->params.enable = g_uvm_perf_thrashing_enable;
563 
564     // Snap the thrashing parameters so that they can be tuned per VA space
565     va_space_thrashing->params.threshold     = g_uvm_perf_thrashing_threshold;
566     va_space_thrashing->params.pin_threshold = g_uvm_perf_thrashing_pin_threshold;
567 
568     // Default thrashing parameters are overriden for simulated/emulated GPUs
569     if (g_uvm_global.num_simulated_devices > 0 &&
570         (g_uvm_perf_thrashing_lapse_usec == UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT)) {
571         va_space_thrashing->params.lapse_ns  = UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT_EMULATION * 1000;
572     }
573     else {
574         va_space_thrashing->params.lapse_ns  = g_uvm_perf_thrashing_lapse_usec * 1000;
575     }
576 
577     va_space_thrashing->params.nap_ns        = va_space_thrashing->params.lapse_ns * g_uvm_perf_thrashing_nap;
578     va_space_thrashing->params.epoch_ns      = va_space_thrashing->params.lapse_ns * g_uvm_perf_thrashing_epoch;
579 
580     if (g_uvm_global.num_simulated_devices > 0 && (g_uvm_perf_thrashing_pin == UVM_PERF_THRASHING_PIN_DEFAULT)) {
581         va_space_thrashing->params.pin_ns    = va_space_thrashing->params.lapse_ns
582                                                * UVM_PERF_THRASHING_PIN_DEFAULT_EMULATION;
583     }
584     else {
585         va_space_thrashing->params.pin_ns    = va_space_thrashing->params.lapse_ns * g_uvm_perf_thrashing_pin;
586     }
587 
588     va_space_thrashing->params.max_resets    = g_uvm_perf_thrashing_max_resets;
589 }
590 
591 // Create the thrashing detection struct for the given VA space
592 //
593 // VA space lock needs to be held in write mode
va_space_thrashing_info_create(uvm_va_space_t * va_space)594 static va_space_thrashing_info_t *va_space_thrashing_info_create(uvm_va_space_t *va_space)
595 {
596     va_space_thrashing_info_t *va_space_thrashing;
597     uvm_assert_rwsem_locked_write(&va_space->lock);
598 
599     UVM_ASSERT(va_space_thrashing_info_get_or_null(va_space) == NULL);
600 
601     va_space_thrashing = uvm_kvmalloc_zero(sizeof(*va_space_thrashing));
602     if (va_space_thrashing) {
603         uvm_va_block_context_t *block_context = uvm_va_block_context_alloc(NULL);
604 
605         if (!block_context) {
606             uvm_kvfree(va_space_thrashing);
607             return NULL;
608         }
609 
610         va_space_thrashing->pinned_pages.va_block_context = block_context;
611         va_space_thrashing->va_space = va_space;
612 
613         va_space_thrashing_info_init_params(va_space_thrashing);
614 
615         uvm_perf_module_type_set_data(va_space->perf_modules_data, va_space_thrashing, UVM_PERF_MODULE_TYPE_THRASHING);
616     }
617 
618     return va_space_thrashing;
619 }
620 
621 // Destroy the thrashing detection struct for the given VA space
622 //
623 // VA space lock needs to be in write mode
va_space_thrashing_info_destroy(uvm_va_space_t * va_space)624 static void va_space_thrashing_info_destroy(uvm_va_space_t *va_space)
625 {
626     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
627     uvm_assert_rwsem_locked_write(&va_space->lock);
628 
629     if (va_space_thrashing) {
630         uvm_perf_module_type_unset_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
631         uvm_va_block_context_free(va_space_thrashing->pinned_pages.va_block_context);
632         uvm_kvfree(va_space_thrashing);
633     }
634 }
635 
636 // Get the thrashing detection struct for the given block
thrashing_info_get(uvm_va_block_t * va_block)637 static block_thrashing_info_t *thrashing_info_get(uvm_va_block_t *va_block)
638 {
639     uvm_assert_mutex_locked(&va_block->lock);
640     return uvm_perf_module_type_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
641 }
642 
643 // Get the thrashing detection struct for the given block or create it if it
644 // does not exist
thrashing_info_get_create(uvm_va_block_t * va_block)645 static block_thrashing_info_t *thrashing_info_get_create(uvm_va_block_t *va_block)
646 {
647     block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
648 
649     BUILD_BUG_ON((1 << 8 * sizeof(block_thrashing->num_thrashing_pages)) < PAGES_PER_UVM_VA_BLOCK);
650     BUILD_BUG_ON((1 << 16) < UVM_ID_MAX_PROCESSORS);
651 
652     if (!block_thrashing) {
653         block_thrashing = nv_kmem_cache_zalloc(g_va_block_thrashing_info_cache, NV_UVM_GFP_FLAGS);
654         if (!block_thrashing)
655             goto done;
656 
657         block_thrashing->last_processor = UVM_ID_INVALID;
658         INIT_LIST_HEAD(&block_thrashing->pinned_pages.list);
659 
660         uvm_perf_module_type_set_data(va_block->perf_modules_data, block_thrashing, UVM_PERF_MODULE_TYPE_THRASHING);
661     }
662 
663 done:
664     return block_thrashing;
665 }
666 
667 static void thrashing_reset_pages_in_region(uvm_va_block_t *va_block, NvU64 address, NvU64 bytes);
668 
uvm_perf_thrashing_info_destroy(uvm_va_block_t * va_block)669 void uvm_perf_thrashing_info_destroy(uvm_va_block_t *va_block)
670 {
671     block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
672 
673     if (block_thrashing) {
674         thrashing_reset_pages_in_region(va_block, va_block->start, uvm_va_block_size(va_block));
675 
676         uvm_perf_module_type_unset_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
677 
678         uvm_kvfree(block_thrashing->pages);
679         kmem_cache_free(g_va_block_thrashing_info_cache, block_thrashing);
680     }
681 }
682 
thrashing_block_destroy_cb(uvm_perf_event_t event_id,uvm_perf_event_data_t * event_data)683 void thrashing_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
684 {
685     uvm_va_block_t *va_block;
686 
687     UVM_ASSERT(g_uvm_perf_thrashing_enable);
688 
689     UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_DESTROY ||
690                event_id == UVM_PERF_EVENT_BLOCK_SHRINK ||
691                event_id == UVM_PERF_EVENT_MODULE_UNLOAD);
692 
693     if (event_id == UVM_PERF_EVENT_BLOCK_DESTROY)
694         va_block = event_data->block_destroy.block;
695     else if (event_id == UVM_PERF_EVENT_BLOCK_SHRINK)
696         va_block = event_data->block_shrink.block;
697     else
698         va_block = event_data->module_unload.block;
699 
700     if (!va_block)
701         return;
702 
703     uvm_perf_thrashing_info_destroy(va_block);
704 }
705 
thrashing_block_munmap_cb(uvm_perf_event_t event_id,uvm_perf_event_data_t * event_data)706 void thrashing_block_munmap_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
707 {
708     uvm_va_block_t *va_block = event_data->block_munmap.block;
709     uvm_va_block_region_t region = event_data->block_munmap.region;
710 
711     UVM_ASSERT(g_uvm_perf_thrashing_enable);
712     UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_MUNMAP);
713     UVM_ASSERT(va_block);
714 
715     thrashing_reset_pages_in_region(va_block,
716                                     uvm_va_block_region_start(va_block, region),
717                                     uvm_va_block_region_size(region));
718 }
719 
720 // Sanity checks of the thrashing tracking state
thrashing_state_checks(uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index)721 static bool thrashing_state_checks(uvm_va_block_t *va_block,
722                                    block_thrashing_info_t *block_thrashing,
723                                    page_thrashing_info_t *page_thrashing,
724                                    uvm_page_index_t page_index)
725 {
726     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
727     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
728 
729     if (!block_thrashing) {
730         UVM_ASSERT(!page_thrashing);
731         return true;
732     }
733 
734     UVM_ASSERT(uvm_page_mask_subset(&block_thrashing->pinned_pages.mask, &block_thrashing->thrashing_pages));
735 
736     if (page_thrashing) {
737         UVM_ASSERT(block_thrashing->pages);
738         UVM_ASSERT(page_thrashing == &block_thrashing->pages[page_index]);
739     }
740     else {
741         UVM_ASSERT(!uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
742         return true;
743     }
744 
745     UVM_ASSERT(uvm_processor_mask_subset(&page_thrashing->throttled_processors,
746                                          &page_thrashing->processors));
747 
748     if (uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index))
749         UVM_ASSERT(page_thrashing->num_thrashing_events >= va_space_thrashing->params.threshold);
750 
751     if (page_thrashing->pinned) {
752         UVM_ASSERT(uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
753         UVM_ASSERT(UVM_ID_IS_VALID(page_thrashing->pinned_residency_id));
754         UVM_ASSERT(page_thrashing->throttling_count == 0);
755     }
756     else {
757         UVM_ASSERT(!uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
758         UVM_ASSERT(UVM_ID_IS_INVALID(page_thrashing->pinned_residency_id));
759 
760         if (!uvm_processor_mask_empty(&page_thrashing->throttled_processors)) {
761             UVM_ASSERT(page_thrashing->throttling_count > 0);
762             UVM_ASSERT(uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
763         }
764     }
765 
766     return true;
767 }
768 
769 // Update throttling heuristics. Mainly check if a new throttling period has
770 // started and choose the next processor not to be throttled. This function
771 // is executed before the thrashing mitigation logic kicks in.
thrashing_throttle_update(va_space_thrashing_info_t * va_space_thrashing,uvm_va_block_t * va_block,page_thrashing_info_t * page_thrashing,uvm_processor_id_t processor,NvU64 time_stamp)772 static void thrashing_throttle_update(va_space_thrashing_info_t *va_space_thrashing,
773                                       uvm_va_block_t *va_block,
774                                       page_thrashing_info_t *page_thrashing,
775                                       uvm_processor_id_t processor,
776                                       NvU64 time_stamp)
777 {
778     NvU64 current_end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
779 
780     uvm_assert_mutex_locked(&va_block->lock);
781 
782     if (time_stamp > current_end_time_stamp) {
783         NvU64 throttling_end_time_stamp = time_stamp + va_space_thrashing->params.nap_ns;
784         page_thrashing_set_throttling_end_time_stamp(page_thrashing, throttling_end_time_stamp);
785 
786         // Avoid choosing the same processor in consecutive thrashing periods
787         if (uvm_id_equal(page_thrashing->do_not_throttle_processor_id, processor))
788             page_thrashing->do_not_throttle_processor_id = UVM_ID_INVALID;
789         else
790             page_thrashing->do_not_throttle_processor_id = processor;
791     }
792     else if (UVM_ID_IS_INVALID(page_thrashing->do_not_throttle_processor_id)) {
793         page_thrashing->do_not_throttle_processor_id = processor;
794     }
795 }
796 
797 // Throttle the execution of a processor. If this is the first processor being
798 // throttled for a throttling period, compute the time stamp until which the
799 // rest of processors will be throttled on fault.
800 //
801 // - Page may be pinned (possible in thrashing due to revocation, such as
802 //   in system-wide atomics)
803 // - Requesting processor must not be throttled at this point.
804 //
thrashing_throttle_processor(uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index,uvm_processor_id_t processor)805 static void thrashing_throttle_processor(uvm_va_block_t *va_block,
806                                          block_thrashing_info_t *block_thrashing,
807                                          page_thrashing_info_t *page_thrashing,
808                                          uvm_page_index_t page_index,
809                                          uvm_processor_id_t processor)
810 {
811     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
812     NvU64 address = uvm_va_block_cpu_page_address(va_block, page_index);
813 
814     uvm_assert_mutex_locked(&va_block->lock);
815 
816     UVM_ASSERT(!uvm_id_equal(processor, page_thrashing->do_not_throttle_processor_id));
817 
818     if (!uvm_processor_mask_test_and_set(&page_thrashing->throttled_processors, processor)) {
819         // CPU is throttled by sleeping. This is done in uvm_vm_fault so it
820         // drops the VA block and VA space locks. Throttling start/end events
821         // are recorded around the sleep calls.
822         if (UVM_ID_IS_GPU(processor))
823             uvm_tools_record_throttling_start(va_space, address, processor);
824 
825         if (!page_thrashing->pinned)
826             UVM_PERF_SATURATING_INC(page_thrashing->throttling_count);
827 
828         UVM_PERF_SATURATING_INC(block_thrashing->throttling_count);
829     }
830 
831     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
832 }
833 
834 // Stop throttling on the given processor. If this is the last processor being
835 // throttled for a throttling period, it will clear the throttling period.
836 //
837 // - Page may be pinned (possible in thrashing due to revocation, such as
838 //   in system-wide atomics)
839 // - Requesting processor must be throttled at this point.
840 //
thrashing_throttle_end_processor(uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index,uvm_processor_id_t processor)841 static void thrashing_throttle_end_processor(uvm_va_block_t *va_block,
842                                              block_thrashing_info_t *block_thrashing,
843                                              page_thrashing_info_t *page_thrashing,
844                                              uvm_page_index_t page_index,
845                                              uvm_processor_id_t processor)
846 {
847     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
848     NvU64 address = uvm_va_block_cpu_page_address(va_block, page_index);
849 
850     UVM_ASSERT(uvm_processor_mask_test(&page_thrashing->throttled_processors, processor));
851     uvm_processor_mask_clear(&page_thrashing->throttled_processors, processor);
852     if (uvm_processor_mask_empty(&page_thrashing->throttled_processors))
853         page_thrashing_set_throttling_end_time_stamp(page_thrashing, 0);
854 
855     // See comment regarding throttling start/end events for CPU in
856     // thrashing_throttle_processor
857     if (UVM_ID_IS_GPU(processor))
858         uvm_tools_record_throttling_end(va_space, address, processor);
859 
860     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
861 }
862 
863 // Clear the throttling state for all processors. This is used while
864 // transitioning to pinned state and during thrashing information reset.
thrashing_throttling_reset_page(uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index)865 static void thrashing_throttling_reset_page(uvm_va_block_t *va_block,
866                                             block_thrashing_info_t *block_thrashing,
867                                             page_thrashing_info_t *page_thrashing,
868                                             uvm_page_index_t page_index)
869 {
870     uvm_processor_id_t processor_id;
871 
872     for_each_id_in_mask(processor_id, &page_thrashing->throttled_processors) {
873         thrashing_throttle_end_processor(va_block,
874                                          block_thrashing,
875                                          page_thrashing,
876                                          page_index,
877                                          processor_id);
878     }
879 
880     UVM_ASSERT(uvm_processor_mask_empty(&page_thrashing->throttled_processors));
881 }
882 
883 // Find the pinned page descriptor for the given page index. Return NULL if the
884 // page is not pinned.
find_pinned_page(block_thrashing_info_t * block_thrashing,uvm_page_index_t page_index)885 static pinned_page_t *find_pinned_page(block_thrashing_info_t *block_thrashing, uvm_page_index_t page_index)
886 {
887     pinned_page_t *pinned_page;
888 
889     list_for_each_entry(pinned_page, &block_thrashing->pinned_pages.list, va_block_list_entry) {
890         if (pinned_page->page_index == page_index)
891             return pinned_page;
892     }
893 
894     return NULL;
895 }
896 
897 // Pin a page on the specified processor. All thrashing processors will be
898 // mapped remotely on this location, when possible
899 //
900 // - Requesting processor cannot be throttled
901 //
thrashing_pin_page(va_space_thrashing_info_t * va_space_thrashing,uvm_va_block_t * va_block,uvm_va_block_context_t * va_block_context,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index,NvU64 time_stamp,uvm_processor_id_t residency,uvm_processor_id_t requester)902 static NV_STATUS thrashing_pin_page(va_space_thrashing_info_t *va_space_thrashing,
903                                     uvm_va_block_t *va_block,
904                                     uvm_va_block_context_t *va_block_context,
905                                     block_thrashing_info_t *block_thrashing,
906                                     page_thrashing_info_t *page_thrashing,
907                                     uvm_page_index_t page_index,
908                                     NvU64 time_stamp,
909                                     uvm_processor_id_t residency,
910                                     uvm_processor_id_t requester)
911 {
912     uvm_processor_mask_t *current_residency = &va_block_context->scratch_processor_mask;
913 
914     uvm_assert_mutex_locked(&va_block->lock);
915     UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester));
916 
917     uvm_va_block_page_resident_processors(va_block, page_index, current_residency);
918 
919     // If we are pinning the page for the first time or we are pinning it on a
920     // different location that the current location, reset the throttling state
921     // to make sure that we flush any pending ThrottlingEnd events.
922     if (!page_thrashing->pinned || !uvm_processor_mask_test(current_residency, residency))
923         thrashing_throttling_reset_page(va_block, block_thrashing, page_thrashing, page_index);
924 
925     if (!page_thrashing->pinned) {
926         if (va_space_thrashing->params.pin_ns > 0) {
927             pinned_page_t *pinned_page = nv_kmem_cache_zalloc(g_pinned_page_cache, NV_UVM_GFP_FLAGS);
928             if (!pinned_page)
929                 return NV_ERR_NO_MEMORY;
930 
931             pinned_page->va_block = va_block;
932             pinned_page->page_index = page_index;
933             pinned_page->deadline = time_stamp + va_space_thrashing->params.pin_ns;
934 
935             uvm_spin_lock(&va_space_thrashing->pinned_pages.lock);
936 
937             list_add_tail(&pinned_page->va_space_list_entry, &va_space_thrashing->pinned_pages.list);
938             list_add_tail(&pinned_page->va_block_list_entry, &block_thrashing->pinned_pages.list);
939 
940             // We only schedule the delayed work if the list was empty before
941             // adding this page. Otherwise, we just add it to the list. The
942             // unpinning helper will remove from the list those pages with
943             // deadline prior to its wakeup timestamp and will reschedule
944             // itself if there are remaining pages in the list.
945             if (list_is_singular(&va_space_thrashing->pinned_pages.list) &&
946                 !va_space_thrashing->pinned_pages.in_va_space_teardown) {
947                 int scheduled;
948                 scheduled = schedule_delayed_work(&va_space_thrashing->pinned_pages.dwork,
949                                                   usecs_to_jiffies(va_space_thrashing->params.pin_ns / 1000));
950                 UVM_ASSERT(scheduled != 0);
951             }
952 
953             uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
954         }
955 
956         page_thrashing->throttling_count = 0;
957         page_thrashing->pinned = true;
958         UVM_PERF_SATURATING_INC(block_thrashing->pinned_pages.count);
959         uvm_page_mask_set(&block_thrashing->pinned_pages.mask, page_index);
960     }
961 
962     page_thrashing->pinned_residency_id = residency;
963 
964     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
965 
966     return NV_OK;
967 }
968 
969 // Unpin a page. This function just clears the pinning tracking state, and does
970 // not remove remote mappings on the page. Callers will need to do it manually
971 // BEFORE calling this function, if so desired.
972 // - Page must be pinned
973 //
thrashing_unpin_page(va_space_thrashing_info_t * va_space_thrashing,uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index)974 static void thrashing_unpin_page(va_space_thrashing_info_t *va_space_thrashing,
975                                  uvm_va_block_t *va_block,
976                                  block_thrashing_info_t *block_thrashing,
977                                  page_thrashing_info_t *page_thrashing,
978                                  uvm_page_index_t page_index)
979 {
980     uvm_assert_mutex_locked(&va_block->lock);
981     UVM_ASSERT(page_thrashing->pinned);
982 
983     if (va_space_thrashing->params.pin_ns > 0) {
984         bool do_free = false;
985         pinned_page_t *pinned_page = find_pinned_page(block_thrashing, page_index);
986 
987         UVM_ASSERT(pinned_page);
988         UVM_ASSERT(pinned_page->page_index == page_index);
989         UVM_ASSERT(pinned_page->va_block == va_block);
990 
991         // The va_space_list_entry and va_block_list_entry have special
992         // meanings here:
993         // - va_space_list_entry: when the delayed unpin worker removes the
994         // pinned_page from this list, it takes the ownership of the page and
995         // is in charge of freeing it.
996         // - va_block_list_entry: by removing the page from this list,
997         // thrashing_unpin_page tells the unpin delayed worker to skip
998         // unpinning that page.
999         uvm_spin_lock(&va_space_thrashing->pinned_pages.lock);
1000         list_del_init(&pinned_page->va_block_list_entry);
1001 
1002         if (!list_empty(&pinned_page->va_space_list_entry)) {
1003             do_free = true;
1004             list_del_init(&pinned_page->va_space_list_entry);
1005 
1006             if (list_empty(&va_space_thrashing->pinned_pages.list))
1007                 cancel_delayed_work(&va_space_thrashing->pinned_pages.dwork);
1008         }
1009 
1010         uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
1011 
1012         if (do_free)
1013             kmem_cache_free(g_pinned_page_cache, pinned_page);
1014     }
1015 
1016     page_thrashing->pinned_residency_id = UVM_ID_INVALID;
1017     page_thrashing->pinned = false;
1018     uvm_page_mask_clear(&block_thrashing->pinned_pages.mask, page_index);
1019 
1020     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
1021 }
1022 
thrashing_detected(uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,page_thrashing_info_t * page_thrashing,uvm_page_index_t page_index,uvm_processor_id_t processor_id)1023 static void thrashing_detected(uvm_va_block_t *va_block,
1024                                block_thrashing_info_t *block_thrashing,
1025                                page_thrashing_info_t *page_thrashing,
1026                                uvm_page_index_t page_index,
1027                                uvm_processor_id_t processor_id)
1028 {
1029     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1030     NvU64 address = uvm_va_block_cpu_page_address(va_block, page_index);
1031 
1032     // Thrashing detected, record the event
1033     uvm_tools_record_thrashing(va_space, address, PAGE_SIZE, &page_thrashing->processors);
1034     if (!uvm_page_mask_test_and_set(&block_thrashing->thrashing_pages, page_index))
1035         ++block_thrashing->num_thrashing_pages;
1036 
1037     PROCESSOR_THRASHING_STATS_INC(va_space, processor_id, num_thrashing);
1038 
1039     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
1040 }
1041 
1042 // Clear the thrashing information for the given page. This function does not
1043 // unmap remote mappings on the page. Callers will need to do it BEFORE calling
1044 // this function, if so desired
thrashing_reset_page(va_space_thrashing_info_t * va_space_thrashing,uvm_va_block_t * va_block,block_thrashing_info_t * block_thrashing,uvm_page_index_t page_index)1045 static void thrashing_reset_page(va_space_thrashing_info_t *va_space_thrashing,
1046                                  uvm_va_block_t *va_block,
1047                                  block_thrashing_info_t *block_thrashing,
1048                                  uvm_page_index_t page_index)
1049 {
1050     page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1051     uvm_assert_mutex_locked(&va_block->lock);
1052 
1053     UVM_ASSERT(block_thrashing->num_thrashing_pages > 0);
1054     UVM_ASSERT(uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
1055     UVM_ASSERT(page_thrashing->num_thrashing_events > 0);
1056 
1057     thrashing_throttling_reset_page(va_block, block_thrashing, page_thrashing, page_index);
1058     UVM_ASSERT(uvm_processor_mask_empty(&page_thrashing->throttled_processors));
1059 
1060     if (page_thrashing->pinned)
1061         thrashing_unpin_page(va_space_thrashing, va_block, block_thrashing, page_thrashing, page_index);
1062 
1063     page_thrashing->last_time_stamp       = 0;
1064     page_thrashing->has_migration_events  = 0;
1065     page_thrashing->has_revocation_events = 0;
1066     page_thrashing->num_thrashing_events  = 0;
1067     uvm_processor_mask_zero(&page_thrashing->processors);
1068 
1069     if (uvm_page_mask_test_and_clear(&block_thrashing->thrashing_pages, page_index))
1070         --block_thrashing->num_thrashing_pages;
1071 
1072     UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
1073 }
1074 
1075 // Call thrashing_reset_page for all the thrashing pages in the region
1076 // described by address and bytes
thrashing_reset_pages_in_region(uvm_va_block_t * va_block,NvU64 address,NvU64 bytes)1077 static void thrashing_reset_pages_in_region(uvm_va_block_t *va_block, NvU64 address, NvU64 bytes)
1078 {
1079     uvm_page_index_t page_index;
1080     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1081     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1082     block_thrashing_info_t *block_thrashing = NULL;
1083     uvm_va_block_region_t region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1084 
1085     block_thrashing = thrashing_info_get(va_block);
1086     if (!block_thrashing || !block_thrashing->pages)
1087         return;
1088 
1089     // Update all pages in the region
1090     for_each_va_block_page_in_region_mask(page_index, &block_thrashing->thrashing_pages, region)
1091         thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
1092 }
1093 
1094 
1095 // Unmap remote mappings from the given processors on the pinned pages
1096 // described by region and block_thrashing->pinned pages.
unmap_remote_pinned_pages(uvm_va_block_t * va_block,uvm_va_block_context_t * va_block_context,block_thrashing_info_t * block_thrashing,uvm_va_block_region_t region,const uvm_processor_mask_t * unmap_processors)1097 static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
1098                                            uvm_va_block_context_t *va_block_context,
1099                                            block_thrashing_info_t *block_thrashing,
1100                                            uvm_va_block_region_t region,
1101                                            const uvm_processor_mask_t *unmap_processors)
1102 {
1103     NV_STATUS status = NV_OK;
1104     NV_STATUS tracker_status;
1105     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1106     uvm_processor_id_t processor_id;
1107     const uvm_va_policy_t *policy = uvm_va_policy_get(va_block, uvm_va_block_region_start(va_block, region));
1108 
1109     uvm_assert_mutex_locked(&va_block->lock);
1110 
1111     for_each_id_in_mask(processor_id, unmap_processors) {
1112         UVM_ASSERT(uvm_id_equal(processor_id, policy->preferred_location) ||
1113                    !uvm_processor_mask_test(&policy->accessed_by, processor_id));
1114 
1115         if (uvm_processor_mask_test(&va_block->resident, processor_id)) {
1116             const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
1117 
1118             if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask,
1119                                       &block_thrashing->pinned_pages.mask,
1120                                       resident_mask))
1121                 continue;
1122         }
1123         else {
1124             uvm_page_mask_copy(&va_block_context->caller_page_mask, &block_thrashing->pinned_pages.mask);
1125         }
1126 
1127         status = uvm_va_block_unmap(va_block,
1128                                     va_block_context,
1129                                     processor_id,
1130                                     region,
1131                                     &va_block_context->caller_page_mask,
1132                                     &local_tracker);
1133         if (status != NV_OK)
1134             break;
1135     }
1136 
1137     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
1138     if (status == NV_OK)
1139         status = tracker_status;
1140 
1141     uvm_tracker_deinit(&local_tracker);
1142 
1143     return status;
1144 }
1145 
1146 NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_block,
1147                                                            uvm_va_block_context_t *va_block_context,
1148                                                            uvm_va_block_region_t region)
1149 {
1150     block_thrashing_info_t *block_thrashing;
1151     uvm_processor_mask_t *unmap_processors = &va_block_context->unmap_processors_mask;
1152     const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
1153 
1154     uvm_assert_mutex_locked(&va_block->lock);
1155 
1156     block_thrashing = thrashing_info_get(va_block);
1157     if (!block_thrashing || !block_thrashing->pages)
1158         return NV_OK;
1159 
1160     if (uvm_page_mask_empty(&block_thrashing->pinned_pages.mask))
1161         return NV_OK;
1162 
1163     // Unmap all mapped processors (that are not SetAccessedBy) with
1164     // no copy of the page
1165     uvm_processor_mask_andnot(unmap_processors, &va_block->mapped, &policy->accessed_by);
1166 
1167     return unmap_remote_pinned_pages(va_block, va_block_context, block_thrashing, region, unmap_processors);
1168 }
1169 
1170 // Check that we are not migrating pages away from their pinned location and
1171 // that we are not prefetching thrashing pages.
1172 static bool migrating_wrong_pages(uvm_va_block_t *va_block,
1173                                   NvU64 address,
1174                                   NvU64 bytes,
1175                                   uvm_processor_id_t proc_id,
1176                                   uvm_make_resident_cause_t cause)
1177 {
1178     uvm_page_index_t page_index;
1179     block_thrashing_info_t *block_thrashing = NULL;
1180     uvm_va_block_region_t region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1181 
1182     block_thrashing = thrashing_info_get(va_block);
1183     if (!block_thrashing || !block_thrashing->pages)
1184         return false;
1185 
1186     for_each_va_block_page_in_region(page_index, region) {
1187         page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1188         UVM_ASSERT_MSG(!page_thrashing->pinned || uvm_id_equal(proc_id, page_thrashing->pinned_residency_id),
1189                        "Migrating to %u instead of %u\n",
1190                        uvm_id_value(proc_id), uvm_id_value(page_thrashing->pinned_residency_id));
1191         if (cause == UVM_MAKE_RESIDENT_CAUSE_PREFETCH)
1192             UVM_ASSERT(!uvm_page_mask_test(&block_thrashing->thrashing_pages, page_index));
1193     }
1194 
1195     return false;
1196 }
1197 
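// Returns true if this migration event is only updating the residency of
// already-pinned pages: the migration was triggered by a replayable fault or
// an access counter notification and the whole region is covered by the
// pinned pages mask.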
1198 static bool is_migration_pinned_pages_update(uvm_va_block_t *va_block,
1199                                              const uvm_perf_event_data_t *event_data,
1200                                              NvU64 address,
1201                                              NvU64 bytes)
1202 {
1203     const block_thrashing_info_t *block_thrashing = NULL;
1204     uvm_va_block_region_t region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1205     bool ret;
1206 
1207     if (event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT &&
1208         event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) {
1209         return false;
1210     }
1211 
1212     block_thrashing = thrashing_info_get(va_block);
1213     if (!block_thrashing || !block_thrashing->pages)
1214         return false;
1215 
1216     ret = uvm_page_mask_region_full(&block_thrashing->pinned_pages.mask, region);
1217     if (ret) {
1218         uvm_page_index_t page_index;
1219         for_each_va_block_page_in_region(page_index, region) {
1220             page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1221             UVM_ASSERT(uvm_id_equal(page_thrashing->pinned_residency_id, event_data->migration.dst));
1222         }
1223     }
1224 
1225     return ret;
1226 }
1227 
1228 // This function processes migration/revocation events and determines if the
1229 // affected pages are thrashing or not.
1230 void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
1231 {
1232     va_space_thrashing_info_t *va_space_thrashing;
1233     block_thrashing_info_t *block_thrashing = NULL;
1234     uvm_va_block_t *va_block;
1235     uvm_va_space_t *va_space;
1236     NvU64 address;
1237     NvU64 bytes;
1238     uvm_processor_id_t processor_id;
1239     uvm_page_index_t page_index;
1240     NvU64 time_stamp;
1241     uvm_va_block_region_t region;
1242     uvm_read_duplication_policy_t read_duplication;
1243 
1244     UVM_ASSERT(g_uvm_perf_thrashing_enable);
1245 
1246     UVM_ASSERT(event_id == UVM_PERF_EVENT_MIGRATION || event_id == UVM_PERF_EVENT_REVOCATION);
1247 
1248     if (event_id == UVM_PERF_EVENT_MIGRATION) {
1249         va_block     = event_data->migration.block;
1250         address      = event_data->migration.address;
1251         bytes        = event_data->migration.bytes;
1252         processor_id = event_data->migration.dst;
1253 
1254         // Skip the thrashing detection logic on eviction as we cannot take
1255         // the VA space lock
1256         if (event_data->migration.cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION)
1257             return;
1258 
1259         // Do not perform checks during the first part of staging copies
1260         if (!uvm_id_equal(event_data->migration.dst, event_data->migration.make_resident_context->dest_id))
1261             return;
1262 
1263         va_space = uvm_va_block_get_va_space(va_block);
1264         va_space_thrashing = va_space_thrashing_info_get(va_space);
1265         if (!va_space_thrashing->params.enable)
1266             return;
1267 
1268         // TODO: Bug 3660922: HMM will need to look up the policy when
1269         // read duplication is supported.
1270         read_duplication = uvm_va_block_is_hmm(va_block) ?
1271                            UVM_READ_DUPLICATION_UNSET :
1272                            uvm_va_range_get_policy(va_block->va_range)->read_duplication;
1273 
1274         // We only care about migrations due to replayable faults, access
1275         // counters and page prefetching. For non-replayable faults, UVM will
1276         // try not to migrate memory since CE is transferring data anyway.
1277         // However, we can still see migration events due to initial
1278         // population. The rest of the migrations are triggered by user
1279         // commands or advice (such as read duplication), which take precedence
1280         // over our heuristics. Therefore, we clear our internal tracking
1281         // state.
1282         if ((event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT &&
1283              event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER &&
1284              event_data->migration.cause != UVM_MAKE_RESIDENT_CAUSE_PREFETCH) ||
1285             (event_data->migration.transfer_mode != UVM_VA_BLOCK_TRANSFER_MODE_MOVE) ||
1286             (read_duplication == UVM_READ_DUPLICATION_ENABLED)) {
1287             thrashing_reset_pages_in_region(va_block, address, bytes);
1288             return;
1289         }
1290 
1291         // Assert that we are not migrating pages that are pinned away from
1292         // their pinning residency, or prefetching pages that are thrashing
1293         UVM_ASSERT(!migrating_wrong_pages(va_block, address, bytes, processor_id, event_data->migration.cause));
1294 
1295         // If the migration is just updating the residency of pinned pages, return
1296         if (is_migration_pinned_pages_update(va_block, event_data, address, bytes))
1297             return;
1298     }
1299     else {
1300         va_block     = event_data->revocation.block;
1301         address      = event_data->revocation.address;
1302         bytes        = event_data->revocation.bytes;
1303         processor_id = event_data->revocation.proc_id;
1304 
1305         va_space = uvm_va_block_get_va_space(va_block);
1306         va_space_thrashing = va_space_thrashing_info_get(va_space);
1307         if (!va_space_thrashing->params.enable)
1308             return;
1309     }
1310 
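    // From this point on, migration and revocation events are handled the same
    // way: create the block-level tracking state if needed and record the
    // event on every page in the affected region.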
1311     block_thrashing = thrashing_info_get_create(va_block);
1312     if (!block_thrashing)
1313         return;
1314 
1315     time_stamp = NV_GETTIME();
1316 
1317     if (!block_thrashing->pages) {
1318         // Don't create the per-page tracking structure unless there is some potential thrashing within the block
1319         NvU16 num_block_pages;
1320 
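        // Allocate the per-page tracking only when a different processor
        // generates an event within the thrashing lapse; otherwise just record
        // the block-level time stamp and processor at the "done" label below.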
1321         if (block_thrashing->last_time_stamp == 0 ||
1322             uvm_id_equal(block_thrashing->last_processor, processor_id) ||
1323             time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns)
1324             goto done;
1325 
1326         num_block_pages = uvm_va_block_size(va_block) / PAGE_SIZE;
1327 
1328         block_thrashing->pages = uvm_kvmalloc_zero(sizeof(*block_thrashing->pages) * num_block_pages);
1329         if (!block_thrashing->pages)
1330             goto done;
1331 
1332         for (page_index = 0; page_index < num_block_pages; ++page_index) {
1333             block_thrashing->pages[page_index].pinned_residency_id = UVM_ID_INVALID;
1334             block_thrashing->pages[page_index].do_not_throttle_processor_id = UVM_ID_INVALID;
1335         }
1336     }
1337 
1338     region = uvm_va_block_region_from_start_size(va_block, address, bytes);
1339 
1340     // Update all pages in the region
1341     for_each_va_block_page_in_region(page_index, region) {
1342         page_thrashing_info_t *page_thrashing = &block_thrashing->pages[page_index];
1343         NvU64 last_time_stamp = page_thrashing_get_time_stamp(page_thrashing);
1344 
1345         // It is not possible that a pinned page is migrated here, since the
1346         // fault that triggered the migration should have unpinned it in its
1347         // call to uvm_perf_thrashing_get_hint. Moreover, page prefetching never
1348         // includes pages that are thrashing (pinned pages included).
1349         if (event_id == UVM_PERF_EVENT_MIGRATION)
1350             UVM_ASSERT(page_thrashing->pinned == 0);
1351 
1352         uvm_processor_mask_set(&page_thrashing->processors, processor_id);
1353         page_thrashing_set_time_stamp(page_thrashing, time_stamp);
1354 
1355         if (last_time_stamp == 0)
1356             continue;
1357 
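        // A new event on this page within the thrashing lapse counts towards
        // thrashing. Detection fires exactly once, when the event counter
        // reaches the threshold.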
1358         if (time_stamp - last_time_stamp <= va_space_thrashing->params.lapse_ns) {
1359             UVM_PERF_SATURATING_INC(page_thrashing->num_thrashing_events);
1360             if (page_thrashing->num_thrashing_events == va_space_thrashing->params.threshold)
1361                 thrashing_detected(va_block, block_thrashing, page_thrashing, page_index, processor_id);
1362 
1363             if (page_thrashing->num_thrashing_events >= va_space_thrashing->params.threshold)
1364                 block_thrashing->last_thrashing_time_stamp = time_stamp;
1365 
1366             if (event_id == UVM_PERF_EVENT_MIGRATION)
1367                 page_thrashing->has_migration_events = true;
1368             else
1369                 page_thrashing->has_revocation_events = true;
1370         }
1371         else if (page_thrashing->num_thrashing_events >= va_space_thrashing->params.threshold &&
1372                  !page_thrashing->pinned) {
1373             thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
1374         }
1375     }
1376 
1377 done:
1378     block_thrashing->last_time_stamp = time_stamp;
1379     block_thrashing->last_processor  = processor_id;
1380 }
1381 
1382 static bool thrashing_processors_can_access(uvm_va_space_t *va_space,
1383                                             page_thrashing_info_t *page_thrashing,
1384                                             uvm_processor_id_t to)
1385 {
1386     if (UVM_ID_IS_INVALID(to))
1387         return false;
1388 
1389     return uvm_processor_mask_subset(&page_thrashing->processors,
1390                                      &va_space->accessible_from[uvm_id_value(to)]);
1391 }
1392 
1393 static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space,
1394                                                      uvm_va_block_context_t *va_block_context,
1395                                                      page_thrashing_info_t *page_thrashing,
1396                                                      uvm_processor_id_t to)
1397 {
1398     uvm_processor_mask_t *fast_to = &va_block_context->fast_access_mask;
1399 
1400     if (UVM_ID_IS_INVALID(to))
1401         return false;
1402 
1403     // Combine NVLINK and native atomics mask since we could have PCIe
1404     // atomics in the future
1405     uvm_processor_mask_and(fast_to,
1406                            &va_space->has_nvlink[uvm_id_value(to)],
1407                            &va_space->has_native_atomics[uvm_id_value(to)]);
1408     uvm_processor_mask_set(fast_to, to);
1409 
1410     return uvm_processor_mask_subset(&page_thrashing->processors, fast_to);
1411 }
1412 
1413 static void thrashing_processors_common_locations(uvm_va_space_t *va_space,
1414                                                   page_thrashing_info_t *page_thrashing,
1415                                                   uvm_processor_mask_t *common_locations)
1416 {
1417     bool is_first = true;
1418     uvm_processor_id_t id;
1419 
1420     // Find processors that can be accessed from all thrashing processors. For
1421     // example: if A, B and C are thrashing, and both A and C can access B,
1422     // then B is the common location.
1423     uvm_processor_mask_zero(common_locations);
1424 
1425     for_each_id_in_mask(id, &page_thrashing->processors) {
1426         if (is_first)
1427             uvm_processor_mask_copy(common_locations, &va_space->can_access[uvm_id_value(id)]);
1428         else
1429             uvm_processor_mask_and(common_locations, common_locations, &va_space->can_access[uvm_id_value(id)]);
1430 
1431         is_first = false;
1432     }
1433 }
1434 
1435 static bool preferred_location_is_thrashing(uvm_processor_id_t preferred_location,
1436                                             page_thrashing_info_t *page_thrashing)
1437 {
1438     if (UVM_ID_IS_INVALID(preferred_location))
1439         return false;
1440 
1441     return uvm_processor_mask_test(&page_thrashing->processors, preferred_location);
1442 }
1443 
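// Compute the mitigation hint (pin or throttle) for a page that is thrashing
// due to migrations. The residency chosen for pinning depends on the preferred
// location, on which processors are involved in the thrashing, and on their
// interconnect capabilities.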
1444 static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thrashing_info_t *va_space_thrashing,
1445                                                                   uvm_va_block_t *va_block,
1446                                                                   uvm_va_block_context_t *va_block_context,
1447                                                                   uvm_page_index_t page_index,
1448                                                                   page_thrashing_info_t *page_thrashing,
1449                                                                   uvm_processor_id_t requester)
1450 {
1451     uvm_perf_thrashing_hint_t hint;
1452     uvm_processor_id_t closest_resident_id;
1453     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1454     uvm_processor_id_t do_not_throttle_processor = page_thrashing->do_not_throttle_processor_id;
1455     uvm_processor_id_t pinned_residency = page_thrashing->pinned_residency_id;
1456     const uvm_va_policy_t *policy;
1457     uvm_processor_id_t preferred_location;
1458 
1459     policy = uvm_va_policy_get(va_block, uvm_va_block_cpu_page_address(va_block, page_index));
1460 
1461     preferred_location = policy->preferred_location;
1462 
1463     hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
1464 
1465     closest_resident_id = uvm_va_block_page_get_closest_resident(va_block, va_block_context, page_index, requester);
1466     if (uvm_va_block_is_hmm(va_block)) {
1467         // HMM pages always start out resident on the CPU but may not be
1468         // recorded in the va_block state because hmm_range_fault() or
1469         // similar functions haven't been called to get an accurate snapshot
1470         // of the Linux state. We can assume pages are CPU resident for the
1471         // purpose of deciding where to migrate to reduce thrashing.
1472         if (UVM_ID_IS_INVALID(closest_resident_id))
1473             closest_resident_id = UVM_ID_CPU;
1474     }
1475     else {
1476         UVM_ASSERT(UVM_ID_IS_VALID(closest_resident_id));
1477     }
1478 
1479     if (thrashing_processors_can_access(va_space, page_thrashing, preferred_location)) {
1480         // The logic in uvm_va_block_select_residency chooses the preferred
1481         // location if the requester can access it, so all processors should
1482         // naturally get mapped to the preferred location without thrashing. However,
1483         // we can get here if preferred location was set after processors
1484         // started thrashing.
1485         //
1486         // TODO: Bug 2527408. Reset thrashing history when a user policy
1487         //       changes in a VA block.
1488         hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1489         hint.pin.residency = preferred_location;
1490     }
1491     else if (!preferred_location_is_thrashing(preferred_location, page_thrashing) &&
1492              thrashing_processors_have_fast_access_to(va_space, va_block_context, page_thrashing, closest_resident_id)) {
1493         // This is a fast path for those scenarios in which all thrashing
1494         // processors have fast (NVLINK + native atomics) access to the current
1495         // residency. This is skipped if the preferred location is thrashing and
1496         // not accessible by the rest of thrashing processors. Otherwise, we
1497         // would be in the condition above.
1498         if (UVM_ID_IS_CPU(closest_resident_id)) {
1499             // On P9 systems, we prefer the CPU to map vidmem (since it can
1500             // cache it), so don't map the GPU to sysmem.
1501             if (UVM_ID_IS_GPU(requester)) {
1502                 hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1503                 hint.pin.residency = requester;
1504             }
1505         }
1506         else {
1507             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1508             hint.pin.residency = closest_resident_id;
1509         }
1510     }
1511     else if (uvm_id_equal(requester, preferred_location)) {
1512         if (page_thrashing->pinned) {
1513             // If the faulting processor is the preferred location, we can
1514             // only:
1515             // 1) Pin to the preferred location
1516             // 2) Throttle if it's pinned elsewhere and we are not the
1517             //    do_not_throttle_processor
1518             if (uvm_id_equal(preferred_location, pinned_residency) ||
1519                 uvm_id_equal(preferred_location, do_not_throttle_processor)) {
1520                 hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1521                 hint.pin.residency = preferred_location;
1522             }
1523             else {
1524                 hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1525             }
1526         }
1527         else if (!uvm_id_equal(preferred_location, do_not_throttle_processor)) {
1528             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1529         }
1530         else if (page_thrashing->throttling_count >= va_space_thrashing->params.pin_threshold) {
1531             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1532             hint.pin.residency = preferred_location;
1533         }
1534     }
1535     else if (page_thrashing->pinned) {
1536         // 1) If the requester is the do_not_throttle_processor, pin to the
1537         //    requester if all thrashing processors can access it, otherwise to
1538         //    a common location, or to the requester anyway if no common
1539         //    location is found.
1540         // 2) Try to map the current pinned residency.
1541         // 3) Throttle.
1542         if (uvm_id_equal(requester, do_not_throttle_processor)) {
1543             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1544 
1545             if (thrashing_processors_can_access(va_space, page_thrashing, requester)) {
1546                 hint.pin.residency = requester;
1547             }
1548             else {
1549                 uvm_processor_mask_t *common_locations = &va_block_context->scratch_processor_mask;
1550 
1551                 thrashing_processors_common_locations(va_space, page_thrashing, common_locations);
1552                 if (uvm_processor_mask_empty(common_locations)) {
1553                     hint.pin.residency = requester;
1554                 }
1555                 else {
1556                     // Find the common location that is closest to the requester
1557                     hint.pin.residency = uvm_processor_mask_find_closest_id(va_space, common_locations, requester);
1558                 }
1559             }
1560         }
1561         else if (uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(pinned_residency)], requester)) {
1562             if (!uvm_va_block_is_hmm(va_block))
1563                 UVM_ASSERT(uvm_id_equal(closest_resident_id, pinned_residency));
1564 
1565             hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1566             hint.pin.residency = pinned_residency;
1567         }
1568         else {
1569             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1570         }
1571     }
1572     else if (!uvm_id_equal(requester, do_not_throttle_processor)) {
1573         hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1574     }
1575     else if (page_thrashing->throttling_count >= va_space_thrashing->params.pin_threshold) {
1576         hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
1577         hint.pin.residency = requester;
1578     }
1579 
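    // If the chosen residency has no memory to back the page, fall back to
    // pinning it in system memory.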
1580     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN &&
1581         !uvm_va_space_processor_has_memory(va_space, hint.pin.residency))
1582         hint.pin.residency = UVM_ID_CPU;
1583 
1584     return hint;
1585 }
1586 
1587 // Function called on fault that tells the fault handler if any operation
1588 // should be performed to minimize thrashing. The logic is as follows:
1589 //
1590 // - Phase0: Block thrashing. If a number of consecutive thrashing events have
1591 //   been detected on the VA block, per-page thrashing tracking information is
1592 //   created.
1593 // - Phase1: Throttling. When several processors fight over a page, we start a
1594 //   "throttling period". During that period, only one processor will be able
1595 //   to service faults on the page, and the rest will be throttled. All CPU
1596 //   faults are considered to belong to the same device, even if they come from
1597 //   different CPU threads.
1598 // - Phase2: Pinning. After a number of consecutive throttling periods, the page
1599 //   is pinned on a specific processor which all of the thrashing processors can
1600 //   access.
1601 // - Phase3: Revocation throttling. Even if the page is pinned, it can be still
1602 //   thrashing due to revocation events (mainly due to system-wide atomics). In
1603 //   that case we keep the page pinned while applying the same algorithm as in
1604 //   Phase1.
1605 uvm_perf_thrashing_hint_t uvm_perf_thrashing_get_hint(uvm_va_block_t *va_block,
1606                                                       uvm_va_block_context_t *va_block_context,
1607                                                       NvU64 address,
1608                                                       uvm_processor_id_t requester)
1609 {
1610     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1611     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1612     block_thrashing_info_t *block_thrashing = NULL;
1613     page_thrashing_info_t *page_thrashing = NULL;
1614     uvm_perf_thrashing_hint_t hint;
1615     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
1616     NvU64 time_stamp;
1617     NvU64 last_time_stamp;
1618 
1619     hint.type = UVM_PERF_THRASHING_HINT_TYPE_NONE;
1620 
1621     if (!va_space_thrashing->params.enable)
1622         return hint;
1623 
1624     // If we don't have enough memory to store thrashing information, we assume
1625     // no thrashing
1626     block_thrashing = thrashing_info_get(va_block);
1627     if (!block_thrashing)
1628         return hint;
1629 
1630     // If the per-page tracking structure has not been created yet, we assume
1631     // no thrashing
1632     if (!block_thrashing->pages)
1633         return hint;
1634 
1635     time_stamp = NV_GETTIME();
1636 
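    // If the block has not seen any thrashing for a whole epoch and no pages
    // are pinned, drop the per-page tracking state (up to max_resets times) so
    // that stale history does not keep penalizing the block.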
1637     if (block_thrashing->last_thrashing_time_stamp != 0 &&
1638         (time_stamp - block_thrashing->last_thrashing_time_stamp > va_space_thrashing->params.epoch_ns) &&
1639         block_thrashing->pinned_pages.count == 0 &&
1640         block_thrashing->thrashing_reset_count < va_space_thrashing->params.max_resets) {
1641         uvm_page_index_t reset_page_index;
1642 
1643         ++block_thrashing->thrashing_reset_count;
1644 
1645         // Clear the state of throttled processors to make sure that we flush
1646         // any pending ThrottlingEnd events
1647         for_each_va_block_page_in_mask(reset_page_index, &block_thrashing->thrashing_pages, va_block) {
1648             thrashing_throttling_reset_page(va_block,
1649                                             block_thrashing,
1650                                             &block_thrashing->pages[reset_page_index],
1651                                             reset_page_index);
1652         }
1653 
1654         // Reset per-page tracking structure
1655         // TODO: Bug 1769904 [uvm] Speculatively unpin pages that were pinned on a specific memory due to thrashing
1656         UVM_ASSERT(uvm_page_mask_empty(&block_thrashing->pinned_pages.mask));
1657         uvm_kvfree(block_thrashing->pages);
1658         block_thrashing->pages                     = NULL;
1659         block_thrashing->num_thrashing_pages       = 0;
1660         block_thrashing->last_processor            = UVM_ID_INVALID;
1661         block_thrashing->last_time_stamp           = 0;
1662         block_thrashing->last_thrashing_time_stamp = 0;
1663         uvm_page_mask_zero(&block_thrashing->thrashing_pages);
1664         goto done;
1665     }
1666 
1667     page_thrashing = &block_thrashing->pages[page_index];
1668 
1669     // Not enough thrashing events yet
1670     if (page_thrashing->num_thrashing_events < va_space_thrashing->params.threshold)
1671         goto done;
1672 
1673     // If the requesting processor is throttled, check the throttling end time
1674     // stamp
1675     if (uvm_processor_mask_test(&page_thrashing->throttled_processors, requester)) {
1676         NvU64 throttling_end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
1677         if (time_stamp < throttling_end_time_stamp &&
1678             !uvm_id_equal(requester, page_thrashing->do_not_throttle_processor_id)) {
1679             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1680             goto done;
1681         }
1682 
1683         thrashing_throttle_end_processor(va_block, block_thrashing, page_thrashing, page_index, requester);
1684     }
1685 
1686     UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester));
1687 
1688     last_time_stamp = page_thrashing_get_time_stamp(page_thrashing);
1689 
1690     // If the lapse since the last thrashing event is longer than a thrashing
1691     // lapse we are no longer thrashing
1692     if (time_stamp - last_time_stamp > va_space_thrashing->params.lapse_ns &&
1693         !page_thrashing->pinned) {
1694         goto done;
1695     }
1696 
1697     // Set the requesting processor in the thrashing processors mask
1698     uvm_processor_mask_set(&page_thrashing->processors, requester);
1699 
1700     UVM_ASSERT(page_thrashing->has_migration_events || page_thrashing->has_revocation_events);
1701 
1702     // Update throttling heuristics
1703     thrashing_throttle_update(va_space_thrashing, va_block, page_thrashing, requester, time_stamp);
1704 
1705     if (page_thrashing->pinned &&
1706         page_thrashing->has_revocation_events &&
1707         !uvm_id_equal(requester, page_thrashing->do_not_throttle_processor_id)) {
1708 
1709         // When we get revocation thrashing, this is due to system-wide atomics
1710         // downgrading the permissions of other processors. Revocations only
1711         // happen when several processors are mapping the same page and there
1712         // are no migrations. In this case, the only thing we can do is to
1713         // throttle the execution of the processors.
1714         hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1715     }
1716     else {
1717         hint = get_hint_for_migration_thrashing(va_space_thrashing,
1718                                                 va_block,
1719                                                 va_block_context,
1720                                                 page_index,
1721                                                 page_thrashing,
1722                                                 requester);
1723     }
1724 
1725 done:
1726     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
1727         NV_STATUS status = thrashing_pin_page(va_space_thrashing,
1728                                               va_block,
1729                                               va_block_context,
1730                                               block_thrashing,
1731                                               page_thrashing,
1732                                               page_index,
1733                                               time_stamp,
1734                                               hint.pin.residency,
1735                                               requester);
1736 
1737         // If there was some problem pinning the page (e.g. OOM), demote to
1738         // throttling
1739         if (status != NV_OK) {
1740             hint.type = UVM_PERF_THRASHING_HINT_TYPE_THROTTLE;
1741         }
1742         else {
1743             if (uvm_id_equal(hint.pin.residency, requester))
1744                 PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_pin_local);
1745             else
1746                 PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_pin_remote);
1747 
1748             uvm_processor_mask_copy(&hint.pin.processors, &page_thrashing->processors);
1749         }
1750     }
1751 
1752     if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
1753         thrashing_throttle_processor(va_block,
1754                                      block_thrashing,
1755                                      page_thrashing,
1756                                      page_index,
1757                                      requester);
1758 
1759         PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_throttle);
1760 
1761         hint.throttle.end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
1762     }
1763     else if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_NONE && page_thrashing) {
1764         UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester));
1765         UVM_ASSERT(!page_thrashing->pinned);
1766         UVM_ASSERT(UVM_ID_IS_INVALID(page_thrashing->pinned_residency_id));
1767     }
1768 
1769     return hint;
1770 }
1771 
1772 uvm_processor_mask_t *uvm_perf_thrashing_get_thrashing_processors(uvm_va_block_t *va_block, NvU64 address)
1773 {
1774     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1775     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1776     block_thrashing_info_t *block_thrashing = NULL;
1777     page_thrashing_info_t *page_thrashing = NULL;
1778     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
1779 
1780     UVM_ASSERT(g_uvm_perf_thrashing_enable);
1781     UVM_ASSERT(va_space_thrashing->params.enable);
1782 
1783     block_thrashing = thrashing_info_get(va_block);
1784     UVM_ASSERT(block_thrashing);
1785 
1786     UVM_ASSERT(block_thrashing->pages);
1787 
1788     page_thrashing = &block_thrashing->pages[page_index];
1789 
1790     return &page_thrashing->processors;
1791 }
1792 
1793 const uvm_page_mask_t *uvm_perf_thrashing_get_thrashing_pages(uvm_va_block_t *va_block)
1794 {
1795     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1796     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1797     block_thrashing_info_t *block_thrashing = NULL;
1798 
1799     if (!va_space_thrashing->params.enable)
1800         return NULL;
1801 
1802     block_thrashing = thrashing_info_get(va_block);
1803     if (!block_thrashing)
1804         return NULL;
1805 
1806     if (block_thrashing->num_thrashing_pages == 0)
1807         return NULL;
1808 
1809     return &block_thrashing->thrashing_pages;
1810 }
1811 
1812 #define TIMER_GRANULARITY_NS 20000ULL
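// Delayed work handler that unpins pages whose pinning deadline has expired.
// It walks the VA space's list of pinned pages, resets the thrashing state of
// the expired entries and re-schedules itself for the earliest pending
// deadline.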
1813 static void thrashing_unpin_pages(struct work_struct *work)
1814 {
1815     struct delayed_work *dwork = to_delayed_work(work);
1816     va_space_thrashing_info_t *va_space_thrashing = container_of(dwork, va_space_thrashing_info_t, pinned_pages.dwork);
1817     uvm_va_space_t *va_space = va_space_thrashing->va_space;
1818     uvm_va_block_context_t *va_block_context = va_space_thrashing->pinned_pages.va_block_context;
1819 
1820     // Take the VA space lock so that VA blocks don't go away during this
1821     // operation.
1822     uvm_va_space_down_read(va_space);
1823 
1824     if (va_space_thrashing->pinned_pages.in_va_space_teardown)
1825         goto exit_no_list_lock;
1826 
1827     while (1) {
1828         pinned_page_t *pinned_page;
1829         uvm_va_block_t *va_block;
1830 
1831         uvm_spin_lock(&va_space_thrashing->pinned_pages.lock);
1832         pinned_page = list_first_entry_or_null(&va_space_thrashing->pinned_pages.list,
1833                                                pinned_page_t,
1834                                                va_space_list_entry);
1835 
1836         if (pinned_page) {
1837             NvU64 now = NV_GETTIME();
1838 
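            // Deadlines within one timer granularity of now are treated as
            // already expired so the work item is not re-scheduled for a
            // negligible delay.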
1839             if (pinned_page->deadline <= (now + TIMER_GRANULARITY_NS)) {
1840                 list_del_init(&pinned_page->va_space_list_entry);
1841 
1842                 // Work cancellation is left to thrashing_unpin_page() as this
1843                 // would only catch the following pattern:
1844                 // - Worker thread A is in thrashing_unpin_pages but hasn't
1845                 // looked at the list yet
1846                 // - Thread B then removes the last entry
1847                 // - Thread C then adds a new entry and re-schedules work
1848                 // - Worker thread A removes the entry added by C because the
1849                 // deadline has passed (unlikely), then cancels the work
1850                 // scheduled by C.
1851             }
1852             else {
1853                 NvU64 elapsed_us = (pinned_page->deadline - now) / 1000;
1854 
1855                 schedule_delayed_work(&va_space_thrashing->pinned_pages.dwork, usecs_to_jiffies(elapsed_us));
1856                 uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
1857                 break;
1858             }
1859         }
1860 
1861         uvm_spin_unlock(&va_space_thrashing->pinned_pages.lock);
1862 
1863         if (!pinned_page)
1864             break;
1865 
1866         va_block = pinned_page->va_block;
1867         if (uvm_va_block_is_hmm(va_block))
1868             uvm_hmm_migrate_begin_wait(va_block);
1869         uvm_mutex_lock(&va_block->lock);
1870 
1871         // Only operate if the pinned page's tracking state isn't already
1872         // cleared by thrashing_unpin_page()
1873         if (!list_empty(&pinned_page->va_block_list_entry)) {
1874             uvm_page_index_t page_index = pinned_page->page_index;
1875             block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
1876 
1877             UVM_ASSERT(block_thrashing);
1878             UVM_ASSERT(uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
1879 
1880             uvm_va_block_context_init(va_block_context, NULL);
1881 
1882             uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
1883                                                              va_block_context,
1884                                                              uvm_va_block_region_for_page(page_index));
1885             thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
1886         }
1887 
1888         uvm_mutex_unlock(&va_block->lock);
1889         if (uvm_va_block_is_hmm(va_block))
1890             uvm_hmm_migrate_finish(va_block);
1891         kmem_cache_free(g_pinned_page_cache, pinned_page);
1892     }
1893 
1894 exit_no_list_lock:
1895     uvm_va_space_up_read(va_space);
1896 }
1897 
1898 static void thrashing_unpin_pages_entry(struct work_struct *work)
1899 {
1900     UVM_ENTRY_VOID(thrashing_unpin_pages(work));
1901 }
1902 
1903 NV_STATUS uvm_perf_thrashing_load(uvm_va_space_t *va_space)
1904 {
1905     va_space_thrashing_info_t *va_space_thrashing;
1906     NV_STATUS status;
1907 
1908     status = uvm_perf_module_load(&g_module_thrashing, va_space);
1909     if (status != NV_OK)
1910         return status;
1911 
1912     va_space_thrashing = va_space_thrashing_info_create(va_space);
1913     if (!va_space_thrashing)
1914         return NV_ERR_NO_MEMORY;
1915 
1916     uvm_spin_lock_init(&va_space_thrashing->pinned_pages.lock, UVM_LOCK_ORDER_LEAF);
1917     INIT_LIST_HEAD(&va_space_thrashing->pinned_pages.list);
1918     INIT_DELAYED_WORK(&va_space_thrashing->pinned_pages.dwork, thrashing_unpin_pages_entry);
1919 
1920     return NV_OK;
1921 }
1922 
1923 void uvm_perf_thrashing_stop(uvm_va_space_t *va_space)
1924 {
1925     va_space_thrashing_info_t *va_space_thrashing;
1926 
1927     uvm_va_space_down_write(va_space);
1928     va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
1929 
1930     // Prevent further unpinning operations from being scheduled
1931     if (va_space_thrashing)
1932         va_space_thrashing->pinned_pages.in_va_space_teardown = true;
1933 
1934     uvm_va_space_up_write(va_space);
1935 
1936     // Cancel any pending work. We can safely access va_space_thrashing
1937     // because this function is called once from the VA space teardown path,
1938     // and the only function that frees it is uvm_perf_thrashing_unload,
1939     // which is called later in the teardown path.
1940     if (va_space_thrashing)
1941         (void)cancel_delayed_work_sync(&va_space_thrashing->pinned_pages.dwork);
1942 }
1943 
1944 void uvm_perf_thrashing_unload(uvm_va_space_t *va_space)
1945 {
1946     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get_or_null(va_space);
1947 
1948     uvm_perf_module_unload(&g_module_thrashing, va_space);
1949 
1950     // Make sure that there are no pending work items
1951     if (va_space_thrashing) {
1952         UVM_ASSERT(list_empty(&va_space_thrashing->pinned_pages.list));
1953 
1954         va_space_thrashing_info_destroy(va_space);
1955     }
1956 }
1957 
1958 void uvm_perf_thrashing_register_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
1959 {
1960     va_space_thrashing_info_t *va_space_thrashing = va_space_thrashing_info_get(va_space);
1961 
1962     // If a simulated GPU is registered, re-initialize thrashing parameters in
1963     // case they need to be adjusted.
1964     if ((g_uvm_global.num_simulated_devices > 0) && !va_space_thrashing->params.test_overrides)
1965         va_space_thrashing_info_init_params(va_space_thrashing);
1966 }
1967 
1968 NV_STATUS uvm_perf_thrashing_init(void)
1969 {
1970     NV_STATUS status;
1971 
1972     INIT_THRASHING_PARAMETER_TOGGLE(uvm_perf_thrashing_enable, UVM_PERF_THRASHING_ENABLE_DEFAULT);
1973     if (!g_uvm_perf_thrashing_enable)
1974         return NV_OK;
1975 
1976     uvm_perf_module_init("perf_thrashing",
1977                          UVM_PERF_MODULE_TYPE_THRASHING,
1978                          g_callbacks_thrashing,
1979                          ARRAY_SIZE(g_callbacks_thrashing),
1980                          &g_module_thrashing);
1981 
1982     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_threshold,
1983                                          UVM_PERF_THRASHING_THRESHOLD_DEFAULT,
1984                                          UVM_PERF_THRASHING_THRESHOLD_MAX);
1985 
1986     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_pin_threshold,
1987                                          UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT,
1988                                          UVM_PERF_THRASHING_PIN_THRESHOLD_MAX);
1989 
1990 
1991 
1992     // In Confidential Computing, the DMA path is slower due to cryptographic
1993     // operations & other associated overhead. Enforce a larger window to allow
1994     // the thrashing mitigation mechanisms to work properly.
1995     if (g_uvm_global.conf_computing_enabled)
1996         INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 10);
1997     else
1998         INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT);
1999 
2000     INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_nap,
2001                                          UVM_PERF_THRASHING_NAP_DEFAULT,
2002                                          UVM_PERF_THRASHING_NAP_MAX);
2003 
2004 
2005     INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_epoch, UVM_PERF_THRASHING_EPOCH_DEFAULT);
2006 
2007     INIT_THRASHING_PARAMETER(uvm_perf_thrashing_pin, UVM_PERF_THRASHING_PIN_DEFAULT);
2008 
2009     INIT_THRASHING_PARAMETER(uvm_perf_thrashing_max_resets, UVM_PERF_THRASHING_MAX_RESETS_DEFAULT);
2010 
2011     g_va_block_thrashing_info_cache = NV_KMEM_CACHE_CREATE("uvm_block_thrashing_info_t", block_thrashing_info_t);
2012     if (!g_va_block_thrashing_info_cache) {
2013         status = NV_ERR_NO_MEMORY;
2014         goto error;
2015     }
2016 
2017     g_pinned_page_cache = NV_KMEM_CACHE_CREATE("uvm_pinned_page_t", pinned_page_t);
2018     if (!g_pinned_page_cache) {
2019         status = NV_ERR_NO_MEMORY;
2020         goto error;
2021     }
2022 
2023     status = cpu_thrashing_stats_init();
2024     if (status != NV_OK)
2025         goto error;
2026 
2027     return NV_OK;
2028 
2029 error:
2030     uvm_perf_thrashing_exit();
2031 
2032     return status;
2033 }
2034 
2035 void uvm_perf_thrashing_exit(void)
2036 {
2037     cpu_thrashing_stats_exit();
2038 
2039     kmem_cache_destroy_safe(&g_va_block_thrashing_info_cache);
2040     kmem_cache_destroy_safe(&g_pinned_page_cache);
2041 }
2042 
2043 NV_STATUS uvm_perf_thrashing_add_gpu(uvm_gpu_t *gpu)
2044 {
2045     if (!uvm_procfs_is_debug_enabled())
2046         return NV_OK;
2047 
2048     return gpu_thrashing_stats_create(gpu);
2049 }
2050 
2051 void uvm_perf_thrashing_remove_gpu(uvm_gpu_t *gpu)
2052 {
2053     gpu_thrashing_stats_destroy(gpu);
2054 }
2055 
2056 NV_STATUS uvm_test_get_page_thrashing_policy(UVM_TEST_GET_PAGE_THRASHING_POLICY_PARAMS *params, struct file *filp)
2057 {
2058     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2059     va_space_thrashing_info_t *va_space_thrashing;
2060 
2061     uvm_va_space_down_read(va_space);
2062 
2063     va_space_thrashing = va_space_thrashing_info_get(va_space);
2064 
2065     if (va_space_thrashing->params.enable) {
2066         params->policy = UVM_TEST_PAGE_THRASHING_POLICY_ENABLE;
2067         params->nap_ns = va_space_thrashing->params.nap_ns;
2068         params->pin_ns = va_space_thrashing->params.pin_ns;
2069         params->map_remote_on_native_atomics_fault = uvm_perf_map_remote_on_native_atomics_fault != 0;
2070     }
2071     else {
2072         params->policy = UVM_TEST_PAGE_THRASHING_POLICY_DISABLE;
2073     }
2074 
2075     uvm_va_space_up_read(va_space);
2076 
2077     return NV_OK;
2078 }
2079 
2080 NV_STATUS uvm_test_set_page_thrashing_policy(UVM_TEST_SET_PAGE_THRASHING_POLICY_PARAMS *params, struct file *filp)
2081 {
2082     NV_STATUS status = NV_OK;
2083     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2084     va_space_thrashing_info_t *va_space_thrashing;
2085 
2086     if (params->policy >= UVM_TEST_PAGE_THRASHING_POLICY_MAX)
2087         return NV_ERR_INVALID_ARGUMENT;
2088 
2089     if (!g_uvm_perf_thrashing_enable)
2090         return NV_ERR_INVALID_STATE;
2091 
2092     uvm_va_space_down_write(va_space);
2093 
2094     va_space_thrashing = va_space_thrashing_info_get(va_space);
2095     va_space_thrashing->params.test_overrides = true;
2096 
2097     if (params->policy == UVM_TEST_PAGE_THRASHING_POLICY_ENABLE) {
2098         if (va_space_thrashing->params.enable)
2099             goto done_unlock_va_space;
2100 
2101         va_space_thrashing->params.pin_ns = params->pin_ns;
2102         va_space_thrashing->params.enable = true;
2103     }
2104     else {
2105         if (!va_space_thrashing->params.enable)
2106             goto done_unlock_va_space;
2107 
2108         va_space_thrashing->params.enable = false;
2109     }
2110 
2111     // When disabling thrashing detection, destroy the thrashing tracking
2112     // information for all VA blocks and unpin pages
2113     if (!va_space_thrashing->params.enable) {
2114         uvm_va_range_t *va_range;
2115 
2116         uvm_for_each_va_range(va_range, va_space) {
2117             uvm_va_block_t *va_block;
2118 
2119             if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
2120                 continue;
2121 
2122             for_each_va_block_in_va_range(va_range, va_block) {
2123                 uvm_va_block_region_t va_block_region = uvm_va_block_region_from_block(va_block);
2124                 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
2125 
2126                 uvm_mutex_lock(&va_block->lock);
2127 
2128                 // Unmap may split PTEs and require a retry. Needs to be called
2129                 // before the pinned pages information is destroyed.
2130                 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
2131                              uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
2132                                                                               block_context,
2133                                                                               va_block_region));
2134 
2135                 uvm_perf_thrashing_info_destroy(va_block);
2136 
2137                 uvm_mutex_unlock(&va_block->lock);
2138 
2139                 // Re-enable thrashing on failure to avoid getting asserts
2140                 // about having state while thrashing is disabled
2141                 if (status != NV_OK) {
2142                     va_space_thrashing->params.enable = true;
2143                     goto done_unlock_va_space;
2144                 }
2145             }
2146         }
2147 
2148         status = uvm_hmm_clear_thrashing_policy(va_space);
2149 
2150         // Re-enable thrashing on failure to avoid getting asserts
2151         // about having state while thrashing is disabled
2152         if (status != NV_OK) {
2153             va_space_thrashing->params.enable = true;
2154             goto done_unlock_va_space;
2155         }
2156     }
2157 
2158 done_unlock_va_space:
2159     uvm_va_space_up_write(va_space);
2160 
2161     return status;
2162 }
2163