/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_linux.h"
#include "uvm_perf_events.h"
#include "uvm_perf_module.h"
#include "uvm_perf_prefetch.h"
#include "uvm_kvmalloc.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_test.h"

//
// Tunables for the page prefetching heuristics (configurable via module parameters)
//

// Enable/disable prefetch performance heuristics
static unsigned uvm_perf_prefetch_enable = 1;

// TODO: Bug 1778037: [uvm] Use adaptive threshold for page prefetching
#define UVM_PREFETCH_THRESHOLD_DEFAULT 51

// Percentage of pages in a subregion that need to be resident/faulted in
// order to trigger prefetching of the remaining pages in that subregion
//
// Valid values 1-100
static unsigned uvm_perf_prefetch_threshold  = UVM_PREFETCH_THRESHOLD_DEFAULT;

#define UVM_PREFETCH_MIN_FAULTS_MIN     1
#define UVM_PREFETCH_MIN_FAULTS_DEFAULT 1
#define UVM_PREFETCH_MIN_FAULTS_MAX     20

// Minimum number of faults on a block in order to enable the prefetching
// logic
static unsigned uvm_perf_prefetch_min_faults = UVM_PREFETCH_MIN_FAULTS_DEFAULT;

// Module parameters for the tunables
module_param(uvm_perf_prefetch_enable, uint, S_IRUGO);
module_param(uvm_perf_prefetch_threshold, uint, S_IRUGO);
module_param(uvm_perf_prefetch_min_faults, uint, S_IRUGO);

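// Validated copies of the module parameters above, set once by
// uvm_perf_prefetch_init() and used by the rest of this file at runtime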
static bool g_uvm_perf_prefetch_enable;
static unsigned g_uvm_perf_prefetch_threshold;
static unsigned g_uvm_perf_prefetch_min_faults;

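// Position the iterator at the bitmap tree leaf that corresponds to the given
// page index. Traversal with the iterator then proceeds from that leaf
// towards the root of the tree.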
void uvm_perf_prefetch_bitmap_tree_iter_init(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                             uvm_page_index_t page_index,
                                             uvm_perf_prefetch_bitmap_tree_iter_t *iter)
{
    UVM_ASSERT(bitmap_tree->level_count > 0);
    UVM_ASSERT_MSG(page_index < bitmap_tree->leaf_count,
                   "%zd vs %zd",
                   (size_t)page_index,
                   (size_t)bitmap_tree->leaf_count);

    iter->level_idx = bitmap_tree->level_count - 1;
    iter->node_idx  = page_index;
}

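// Return the region of tree leaves (tree-relative page indices) covered by
// the node the iterator currently points to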
uvm_va_block_region_t uvm_perf_prefetch_bitmap_tree_iter_get_range(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                                                   const uvm_perf_prefetch_bitmap_tree_iter_t *iter)
{
    NvU16 range_leaves = uvm_perf_tree_iter_leaf_range(bitmap_tree, iter);
    NvU16 range_start = uvm_perf_tree_iter_leaf_range_start(bitmap_tree, iter);
    uvm_va_block_region_t subregion = uvm_va_block_region(range_start, range_start + range_leaves);

    UVM_ASSERT(iter->level_idx >= 0);
    UVM_ASSERT(iter->level_idx < bitmap_tree->level_count);

    return subregion;
}

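// Return the number of pages set in bitmap_tree->pages within the region
// covered by the iterator's current node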
NvU16 uvm_perf_prefetch_bitmap_tree_iter_get_count(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                                   const uvm_perf_prefetch_bitmap_tree_iter_t *iter)
{
    uvm_va_block_region_t subregion = uvm_perf_prefetch_bitmap_tree_iter_get_range(bitmap_tree, iter);

    return uvm_page_mask_region_weight(&bitmap_tree->pages, subregion);
}

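// Walk the bitmap tree from the leaf that corresponds to page_index up to the
// root and return the largest enclosing subregion whose percentage of set
// pages exceeds g_uvm_perf_prefetch_threshold, converted back to block page
// indices and clamped to max_prefetch_region. If no subregion qualifies, an
// empty region is returned.
//
// For example, with the default threshold of 51, a node covering 64 pages is
// selected once at least 33 of its pages are set (33 * 100 > 64 * 51).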
static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index,
                                                     uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                                     uvm_va_block_region_t max_prefetch_region)
{
    NvU16 counter;
    uvm_perf_prefetch_bitmap_tree_iter_t iter;
    uvm_va_block_region_t prefetch_region = uvm_va_block_region(0, 0);

    uvm_perf_prefetch_bitmap_tree_traverse_counters(counter,
                                                    bitmap_tree,
                                                    page_index - max_prefetch_region.first + bitmap_tree->offset,
                                                    &iter) {
        uvm_va_block_region_t subregion = uvm_perf_prefetch_bitmap_tree_iter_get_range(bitmap_tree, &iter);
        NvU16 subregion_pages = uvm_va_block_region_num_pages(subregion);

        UVM_ASSERT(counter <= subregion_pages);
        if (counter * 100 > subregion_pages * g_uvm_perf_prefetch_threshold)
            prefetch_region = subregion;
    }

    // Convert the tree-relative prefetch region back to block page indices
    // and clamp it to max_prefetch_region
    if (prefetch_region.outer) {
        prefetch_region.first += max_prefetch_region.first;
        if (prefetch_region.first < bitmap_tree->offset) {
            prefetch_region.first = bitmap_tree->offset;
        }
        else {
            prefetch_region.first -= bitmap_tree->offset;
            if (prefetch_region.first < max_prefetch_region.first)
                prefetch_region.first = max_prefetch_region.first;
        }

        prefetch_region.outer += max_prefetch_region.first;
        if (prefetch_region.outer < bitmap_tree->offset) {
            prefetch_region.outer = bitmap_tree->offset;
        }
        else {
            prefetch_region.outer -= bitmap_tree->offset;
            if (prefetch_region.outer > max_prefetch_region.outer)
                prefetch_region.outer = max_prefetch_region.outer;
        }
    }

    return prefetch_region;
}

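// If the given region contains at least one faulted page and none of its
// pages are thrashing, mark the whole region as accessed in the bitmap tree.
// The region is given in block page indices; first and the tree offset are
// used to translate it to tree leaf indices.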
static void grow_fault_granularity_if_no_thrashing(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                                   uvm_va_block_region_t region,
                                                   uvm_page_index_t first,
                                                   const uvm_page_mask_t *faulted_pages,
                                                   const uvm_page_mask_t *thrashing_pages)
{
    if (!uvm_page_mask_region_empty(faulted_pages, region) &&
        (!thrashing_pages || uvm_page_mask_region_empty(thrashing_pages, region))) {
        UVM_ASSERT(region.first >= first);
        region.first = region.first - first + bitmap_tree->offset;
        region.outer = region.outer - first + bitmap_tree->offset;
        UVM_ASSERT(region.outer <= bitmap_tree->leaf_count);
        uvm_page_mask_region_fill(&bitmap_tree->pages, region);
    }
}

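// Grow the access pattern recorded in the bitmap tree to a bigger
// granularity: the whole max_prefetch_region when it contains no big pages,
// otherwise the unaligned prefix, each big page, and the unaligned suffix,
// applying grow_fault_granularity_if_no_thrashing to each of them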
static void grow_fault_granularity(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                   NvU32 big_page_size,
                                   uvm_va_block_region_t big_pages_region,
                                   uvm_va_block_region_t max_prefetch_region,
                                   const uvm_page_mask_t *faulted_pages,
                                   const uvm_page_mask_t *thrashing_pages)
{
    uvm_page_index_t pages_per_big_page = big_page_size / PAGE_SIZE;
    uvm_page_index_t page_index;

    // Migrate whole block if no big pages and no page in it is thrashing
    if (!big_pages_region.outer) {
        grow_fault_granularity_if_no_thrashing(bitmap_tree,
                                               max_prefetch_region,
                                               max_prefetch_region.first,
                                               faulted_pages,
                                               thrashing_pages);
        return;
    }

    // Migrate whole "prefix" if no page in it is thrashing
    if (big_pages_region.first > max_prefetch_region.first) {
        uvm_va_block_region_t prefix_region = uvm_va_block_region(max_prefetch_region.first, big_pages_region.first);

        grow_fault_granularity_if_no_thrashing(bitmap_tree,
                                               prefix_region,
                                               max_prefetch_region.first,
                                               faulted_pages,
                                               thrashing_pages);
    }

    // Migrate whole big pages if they are not thrashing
    for (page_index = big_pages_region.first;
         page_index < big_pages_region.outer;
         page_index += pages_per_big_page) {
        uvm_va_block_region_t big_region = uvm_va_block_region(page_index,
                                                               page_index + pages_per_big_page);

        grow_fault_granularity_if_no_thrashing(bitmap_tree,
                                               big_region,
                                               max_prefetch_region.first,
                                               faulted_pages,
                                               thrashing_pages);
    }

    // Migrate whole "suffix" if no page in it is thrashing
    if (big_pages_region.outer < max_prefetch_region.outer) {
        uvm_va_block_region_t suffix_region = uvm_va_block_region(big_pages_region.outer,
                                                                  max_prefetch_region.outer);

        grow_fault_granularity_if_no_thrashing(bitmap_tree,
                                               suffix_region,
                                               max_prefetch_region.first,
                                               faulted_pages,
                                               thrashing_pages);
    }
}

// Within a block we only allow prefetching to a single processor. Therefore,
// if two processors are accessing non-overlapping regions within the same
// block they won't benefit from prefetching.
//
// TODO: Bug 1778034: [uvm] Explore prefetching to different processors within
// a VA block.
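//
// Computes the mask of pages to prefetch into prefetch_pages and returns the
// number of pages in that mask.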
static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_block,
                                                          uvm_va_block_context_t *va_block_context,
                                                          uvm_processor_id_t new_residency,
                                                          const uvm_page_mask_t *faulted_pages,
                                                          uvm_va_block_region_t faulted_region,
                                                          uvm_page_mask_t *prefetch_pages,
                                                          uvm_perf_prefetch_bitmap_tree_t *bitmap_tree)
{
    uvm_page_index_t page_index;
    const uvm_page_mask_t *resident_mask = NULL;
    const uvm_page_mask_t *thrashing_pages = NULL;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    const uvm_va_policy_t *policy = va_block_context->policy;
    uvm_va_block_region_t max_prefetch_region;
    NvU32 big_page_size;
    uvm_va_block_region_t big_pages_region;

    if (!uvm_id_equal(va_block->prefetch_info.last_migration_proc_id, new_residency)) {
        va_block->prefetch_info.last_migration_proc_id = new_residency;
        va_block->prefetch_info.fault_migrations_to_last_proc = 0;
    }

    // Compute the expanded region in which prefetching is allowed.
    if (uvm_va_block_is_hmm(va_block)) {
        max_prefetch_region = uvm_hmm_get_prefetch_region(va_block,
                                                          va_block_context,
                                                          uvm_va_block_region_start(va_block, faulted_region));
    }
    else {
        max_prefetch_region = uvm_va_block_region_from_block(va_block);
    }

    uvm_page_mask_zero(prefetch_pages);

    if (UVM_ID_IS_CPU(new_residency) || va_block->gpus[uvm_id_gpu_index(new_residency)] != NULL)
        resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency);

    // If this is a first-touch fault and the destination processor is the
    // preferred location, populate the whole max_prefetch_region.
    if (uvm_processor_mask_empty(&va_block->resident) &&
        uvm_id_equal(new_residency, policy->preferred_location)) {
        uvm_page_mask_region_fill(prefetch_pages, max_prefetch_region);
        goto done;
    }

    if (resident_mask)
        uvm_page_mask_or(&bitmap_tree->pages, resident_mask, faulted_pages);
    else
        uvm_page_mask_copy(&bitmap_tree->pages, faulted_pages);

    // If prefetching is restricted to a subregion of the va_block, shift the
    // mask so that the bitmap tree leaves start at max_prefetch_region.first
    uvm_page_mask_shift_right(&bitmap_tree->pages, &bitmap_tree->pages, max_prefetch_region.first);

    // Get the big page size for the new residency.
    // Assume 64K size if the new residency is the CPU or no GPU va space is
    // registered in the current process for this GPU.
    if (UVM_ID_IS_GPU(new_residency) &&
        uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, new_residency)) {
        uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, new_residency);

        big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
    }
    else {
        big_page_size = UVM_PAGE_SIZE_64K;
    }

    big_pages_region = uvm_va_block_big_page_region_subset(va_block, max_prefetch_region, big_page_size);

    // Adjust the prefetch tree to big page granularity to make sure that we
    // get big page-friendly prefetching hints
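    //
    // For example, with 64K big pages and 4K system pages (16 pages per big
    // page), if the first big page boundary is 13 pages past
    // max_prefetch_region.first, the offset below is 16 - 13 = 3, so after
    // the shift that boundary lands on tree leaf 16 and tree nodes line up
    // with big page boundaries.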
    if (big_pages_region.first - max_prefetch_region.first > 0) {
        bitmap_tree->offset = big_page_size / PAGE_SIZE - (big_pages_region.first - max_prefetch_region.first);
        bitmap_tree->leaf_count = uvm_va_block_region_num_pages(max_prefetch_region) + bitmap_tree->offset;

        UVM_ASSERT(bitmap_tree->offset < big_page_size / PAGE_SIZE);
        UVM_ASSERT(bitmap_tree->leaf_count <= PAGES_PER_UVM_VA_BLOCK);

        uvm_page_mask_shift_left(&bitmap_tree->pages, &bitmap_tree->pages, bitmap_tree->offset);
    }
    else {
        bitmap_tree->offset = 0;
        bitmap_tree->leaf_count = uvm_va_block_region_num_pages(max_prefetch_region);
    }

    bitmap_tree->level_count = ilog2(roundup_pow_of_two(bitmap_tree->leaf_count)) + 1;

    thrashing_pages = uvm_perf_thrashing_get_thrashing_pages(va_block);

    // Assume big pages by default. Prefetch the remaining 4KB subregions
    // within the big page region unless there is thrashing.
    grow_fault_granularity(bitmap_tree,
                           big_page_size,
                           big_pages_region,
                           max_prefetch_region,
                           faulted_pages,
                           thrashing_pages);

    // Do not compute prefetch regions with faults on pages that are thrashing
    if (thrashing_pages)
        uvm_page_mask_andnot(&va_block_context->scratch_page_mask, faulted_pages, thrashing_pages);
    else
        uvm_page_mask_copy(&va_block_context->scratch_page_mask, faulted_pages);

    // For each faulted page that is not thrashing, walk the tree to compute
    // the region to prefetch and accumulate it into prefetch_pages
    for_each_va_block_page_in_region_mask(page_index, &va_block_context->scratch_page_mask, faulted_region) {
        uvm_va_block_region_t region = compute_prefetch_region(page_index, bitmap_tree, max_prefetch_region);

        uvm_page_mask_region_fill(prefetch_pages, region);

        // Early out if we have already prefetched up to the end of the
        // maximum prefetch region
        if (region.outer == max_prefetch_region.outer)
            break;
    }

done:
    // Do not prefetch pages that are going to be migrated/populated due to a
    // fault
    uvm_page_mask_andnot(prefetch_pages, prefetch_pages, faulted_pages);

    // TODO: Bug 1765432: prefetching pages that are already mapped on the CPU
    // would trigger a remap, which may cause a large overhead. Therefore,
    // exclude them from the mask.
    // For HMM, we don't know what pages are mapped by the CPU unless we try to
    // migrate them. Prefetch pages will only be opportunistically migrated.
    if (UVM_ID_IS_CPU(new_residency) && !uvm_va_block_is_hmm(va_block)) {
        uvm_page_mask_and(&va_block_context->scratch_page_mask,
                          resident_mask,
                          &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
        uvm_page_mask_andnot(prefetch_pages, prefetch_pages, &va_block_context->scratch_page_mask);
    }

    // Avoid prefetching pages that are thrashing
    if (thrashing_pages)
        uvm_page_mask_andnot(prefetch_pages, prefetch_pages, thrashing_pages);

    va_block->prefetch_info.fault_migrations_to_last_proc += uvm_page_mask_region_weight(faulted_pages, faulted_region);

    return uvm_page_mask_weight(prefetch_pages);
}

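// Compute the prefetch hint for the given faults. On return,
// out_hint->residency is the processor to prefetch to, or UVM_ID_INVALID if
// nothing should be prefetched, and out_hint->prefetch_pages_mask contains
// the pages to prefetch.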
void uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
                                uvm_va_block_context_t *va_block_context,
                                uvm_processor_id_t new_residency,
                                const uvm_page_mask_t *faulted_pages,
                                uvm_va_block_region_t faulted_region,
                                uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
                                uvm_perf_prefetch_hint_t *out_hint)
{
    const uvm_va_policy_t *policy = va_block_context->policy;
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_page_mask_t *prefetch_pages = &out_hint->prefetch_pages_mask;
    NvU32 pending_prefetch_pages;

    uvm_assert_rwsem_locked(&va_space->lock);
    uvm_assert_mutex_locked(&va_block->lock);
    UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, faulted_region));
    UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context, faulted_region));

    out_hint->residency = UVM_ID_INVALID;
    uvm_page_mask_zero(prefetch_pages);

    if (!g_uvm_perf_prefetch_enable)
        return;

    if (!va_space->test.page_prefetch_enabled)
        return;

    pending_prefetch_pages = uvm_perf_prefetch_prenotify_fault_migrations(va_block,
                                                                          va_block_context,
                                                                          new_residency,
                                                                          faulted_pages,
                                                                          faulted_region,
                                                                          prefetch_pages,
                                                                          bitmap_tree);

    if (va_block->prefetch_info.fault_migrations_to_last_proc >= g_uvm_perf_prefetch_min_faults &&
        pending_prefetch_pages > 0) {
        bool changed = false;
        uvm_range_group_range_t *rgr;

        // Only prefetch in range group ranges which have pages that need to
        // move.
        uvm_range_group_for_each_range_in(rgr, va_space, va_block->start, va_block->end) {
            uvm_va_block_region_t region = uvm_va_block_region_from_start_end(va_block,
                                                                              max(rgr->node.start, va_block->start),
                                                                              min(rgr->node.end, va_block->end));

            if (uvm_page_mask_region_empty(faulted_pages, region) &&
                !uvm_page_mask_region_empty(prefetch_pages, region)) {
                uvm_page_mask_region_clear(prefetch_pages, region);
                changed = true;
            }
        }

        if (changed)
            pending_prefetch_pages = uvm_page_mask_weight(prefetch_pages);

        if (pending_prefetch_pages > 0)
            out_hint->residency = va_block->prefetch_info.last_migration_proc_id;
    }
}

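// Validate the module parameters and cache them in the g_uvm_perf_prefetch_*
// globals, falling back to the defaults for out-of-range values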
NV_STATUS uvm_perf_prefetch_init(void)
{
    g_uvm_perf_prefetch_enable = uvm_perf_prefetch_enable != 0;

    if (!g_uvm_perf_prefetch_enable)
        return NV_OK;

    if (uvm_perf_prefetch_threshold <= 100) {
        g_uvm_perf_prefetch_threshold = uvm_perf_prefetch_threshold;
    }
    else {
        pr_info("Invalid value %u for uvm_perf_prefetch_threshold. Using %u instead\n",
                uvm_perf_prefetch_threshold, UVM_PREFETCH_THRESHOLD_DEFAULT);

        g_uvm_perf_prefetch_threshold = UVM_PREFETCH_THRESHOLD_DEFAULT;
    }

    if (uvm_perf_prefetch_min_faults >= UVM_PREFETCH_MIN_FAULTS_MIN &&
        uvm_perf_prefetch_min_faults <= UVM_PREFETCH_MIN_FAULTS_MAX) {
        g_uvm_perf_prefetch_min_faults = uvm_perf_prefetch_min_faults;
    }
    else {
        pr_info("Invalid value %u for uvm_perf_prefetch_min_faults. Using %u instead\n",
                uvm_perf_prefetch_min_faults, UVM_PREFETCH_MIN_FAULTS_DEFAULT);

        g_uvm_perf_prefetch_min_faults = UVM_PREFETCH_MIN_FAULTS_DEFAULT;
    }

    return NV_OK;
}

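// Test-only ioctl handler that enables or disables page prefetching for the
// VA space associated with the given file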
NV_STATUS uvm_test_set_page_prefetch_policy(UVM_TEST_SET_PAGE_PREFETCH_POLICY_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    if (params->policy >= UVM_TEST_PAGE_PREFETCH_POLICY_MAX)
        return NV_ERR_INVALID_ARGUMENT;

    uvm_va_space_down_write(va_space);

    if (params->policy == UVM_TEST_PAGE_PREFETCH_POLICY_ENABLE)
        va_space->test.page_prefetch_enabled = true;
    else
        va_space->test.page_prefetch_enabled = false;

    uvm_va_space_up_write(va_space);

    return NV_OK;
}