/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include <asm/atomic.h>

#include "uvm_global.h"
#include "uvm_channel.h"
#include "uvm_hal.h"
#include "uvm_mem.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_test_rng.h"
#include "uvm_thread_context.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_gpu_semaphore.h"
#include "uvm_kvmalloc.h"

#define TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES 2
static NvU32 get_push_begin_size(uvm_channel_t *channel)
{
    // SEC2 channels allocate CSL signature buffer at the beginning.
    if (uvm_channel_is_sec2(channel))
        return UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE + UVM_METHOD_SIZE;

    return 0;
}

// This is the storage required by a semaphore release.
static NvU32 get_push_end_min_size(uvm_channel_t *channel)
{
    if (g_uvm_global.conf_computing_enabled) {
        if (uvm_channel_is_ce(channel)) {
            // Space (in bytes) used by uvm_push_end() on a CE channel when
            // the Confidential Computing feature is enabled.
            //
            // Note that CE semaphore release pushes two memsets and one
            // encryption method on top of the regular release.
            // Memset size
            // -------------
            // PUSH_2U (SET_REMAP)              :   3 Words
            // PUSH_2U (OFFSET_OUT)             :   3 Words
            // PUSH_1U (LINE_LENGTH_IN)         :   2 Words
            // PUSH_1U (LAUNCH_DMA)             :   2 Words
            // Total 10 * UVM_METHOD_SIZE       :  40 Bytes
            //
            // Encrypt size
            // -------------
            // PUSH_1U (SET_SECURE_COPY_MODE)   :   2 Words
            // PUSH_4U (ENCRYPT_AUTH_TAG + IV)  :   5 Words
            // PUSH_4U (OFFSET_IN_OUT)          :   5 Words
            // PUSH_2U (LINE_LENGTH_IN)         :   2 Words
            // PUSH_2U (LAUNCH_DMA)             :   2 Words
            // Total 16 * UVM_METHOD_SIZE       :  64 Bytes
            //
            // TOTAL                            : 144 Bytes

            if (uvm_channel_is_wlc(channel)) {
                // Same as CE + LCIC GPPut update + LCIC doorbell
                return 24 + 144 + 24 + 24;
            }

            return 24 + 144;
        }

        UVM_ASSERT(uvm_channel_is_sec2(channel));

        // A perfectly aligned inline buffer in SEC2 semaphore release.
        // We add UVM_METHOD_SIZE because of the NOP method to reserve
        // UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES (the inline buffer).
        return 48 + UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES + UVM_METHOD_SIZE;
    }

    UVM_ASSERT(uvm_channel_is_ce(channel));

    // Space (in bytes) used by uvm_push_end() on a CE channel.
    return 24;
}

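// Maximum storage (in bytes) that uvm_push_end() may use on the given channel.
// This only differs from the minimum for channel types whose release has
// padding or alignment requirements (WLC, SEC2).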
static NvU32 get_push_end_max_size(uvm_channel_t *channel)
{
    // WLC pushes are always padded to UVM_MAX_WLC_PUSH_SIZE
    if (uvm_channel_is_wlc(channel))
        return UVM_MAX_WLC_PUSH_SIZE;

    // Space (in bytes) used by uvm_push_end() on a SEC2 channel.
    // Note that SEC2 semaphore release uses an inline buffer with alignment
    // requirements. This is the "worst" case semaphore_release storage.
    if (uvm_channel_is_sec2(channel))
        return 48 + UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES + UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT;

    UVM_ASSERT(uvm_channel_is_ce(channel));

    // Space (in bytes) used by uvm_push_end() on a CE channel.
    return get_push_end_min_size(channel);
}

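// Check that the space consumed by uvm_push_end() on every pushable channel
// type falls within the [min, max] bounds computed above.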
static NV_STATUS test_push_end_size(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_type_t type;

        for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; ++type) {
            uvm_push_t push;
            NvU32 push_size_before;
            NvU32 push_end_size_observed;
            NvU32 push_end_size_expected[2];

            // SEC2 is only available when Confidential Computing is enabled
            if ((type == UVM_CHANNEL_TYPE_SEC2) && !g_uvm_global.conf_computing_enabled)
                continue;

            // WLC is only available when Confidential Computing is enabled
            if ((type == UVM_CHANNEL_TYPE_WLC) && !g_uvm_global.conf_computing_enabled)
                continue;

            // LCIC doesn't accept pushes
            if (type == UVM_CHANNEL_TYPE_LCIC)
                continue;

            TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
                                             type,
                                             &push,
                                             "type %s",
                                             uvm_channel_type_to_string(type)));

            push_size_before = uvm_push_get_size(&push);
            uvm_push_end(&push);
            push_end_size_observed = uvm_push_get_size(&push) - push_size_before;

            push_end_size_expected[0] = get_push_end_min_size(push.channel);
            push_end_size_expected[1] = get_push_end_max_size(push.channel);

            if (push_end_size_observed < push_end_size_expected[0] ||
                push_end_size_observed > push_end_size_expected[1]) {
                UVM_TEST_PRINT("push_end_size incorrect, %u instead of [%u:%u] on channel type %s for GPU %s\n",
                               push_end_size_observed,
                               push_end_size_expected[0],
                               push_end_size_expected[1],
                               uvm_channel_type_to_string(type),
                               uvm_gpu_name(gpu));

                // The size mismatch error gets precedence over a wait error
                (void) uvm_push_wait(&push);

                return NV_ERR_INVALID_STATE;
            }

            TEST_NV_CHECK_RET(uvm_push_wait(&push));
        }
    }

    return NV_OK;
}

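// The three ways inline data can be placed in a push; the inline data test
// exercises each of them.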
typedef enum {
    TEST_INLINE_ADD,
    TEST_INLINE_GET,
    TEST_INLINE_SINGLE_BUFFER,
    TEST_INLINE_MAX,
} test_inline_type_t;

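// For each inline data interface and a range of sizes, write a known pattern
// as inline data, memcopy it on the GPU into a sysmem buffer, and verify the
// result on the CPU.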
static NV_STATUS test_push_inline_data_gpu(uvm_gpu_t *gpu)
{
    static const size_t test_sizes[] = { 1, 2, 3, 4, 8, 31, 32, 1023, 1024, 1025, UVM_PUSH_INLINE_DATA_MAX_SIZE };
    NV_STATUS status;
    int i, j;
    int test_inline_type;
    uvm_push_t push;
    uvm_mem_t *mem = NULL;
    char *verif;

    // TODO: Bug 3839176: the test is waived when Confidential Computing is
    // enabled because it assumes that the GPU can access system memory without
    // using encryption.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_PUSH_INLINE_DATA_MAX_SIZE, current->mm, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);

    status = uvm_mem_map_gpu_kernel(mem, gpu);
    TEST_CHECK_GOTO(status == NV_OK, done);

    verif = (char *)uvm_mem_get_cpu_addr_kernel(mem);

    for (test_inline_type = 0; test_inline_type < TEST_INLINE_MAX; ++test_inline_type) {
        for (i = 0; i < ARRAY_SIZE(test_sizes); ++i) {
            size_t test_size = test_sizes[i];
            uvm_push_inline_data_t data;
            size_t inline_data_size = 0;
            uvm_gpu_address_t data_gpu_address;
            char *inline_buf;

            status = uvm_push_begin(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_GPU_INTERNAL,
                                    &push,
                                    "Inline data size %zu",
                                    test_size);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Do a noop first to test inline data starting at different offsets
            gpu->parent->host_hal->noop(&push, roundup(min(test_size, (size_t)4096), UVM_METHOD_SIZE));

            switch (test_inline_type) {
                case TEST_INLINE_ADD:
                    uvm_push_inline_data_begin(&push, &data);
                    for (j = 0; j < test_size; ++j) {
                        char value = 1 + i + j;
                        uvm_push_inline_data_add(&data, &value, 1);
                    }
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    break;
                case TEST_INLINE_GET:
                    uvm_push_inline_data_begin(&push, &data);
                    inline_buf = (char*)uvm_push_inline_data_get(&data, test_size);
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
                case TEST_INLINE_SINGLE_BUFFER:
                    inline_buf = (char*)uvm_push_get_single_inline_buffer(&push,
                                                                          test_size,
                                                                          UVM_METHOD_SIZE,
                                                                          &data_gpu_address);
                    inline_data_size = test_size;
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
            }

            gpu->parent->ce_hal->memcopy(&push,
                                         uvm_mem_gpu_address_virtual_kernel(mem, gpu),
                                         data_gpu_address,
                                         test_size);
            status = uvm_push_end_and_wait(&push);
            TEST_CHECK_GOTO(status == NV_OK, done);

            TEST_CHECK_GOTO(inline_data_size == test_size, done);

            for (j = 0; j < test_size; ++j) {
                char expected = 1 + i + j;
                if (verif[j] != expected) {
                    UVM_TEST_PRINT("size %zu verif[%d] = %d instead of %d\n", test_size, j, verif[j], expected);
                    status = NV_ERR_INVALID_STATE;
                    goto done;
                }
            }
        }
    }

done:
    uvm_mem_free(mem);

    return status;
}

static NV_STATUS test_push_inline_data(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_CHECK_RET(test_push_inline_data_gpu(gpu) == NV_OK);
    }

    return NV_OK;
}

// Test that begins UVM_PUSH_MAX_CONCURRENT_PUSHES pushes on each GPU before
// ending any of them.
// Note that starting more than a single push is not safe to do outside of a
// test: if multiple threads tried doing so, it could easily deadlock.
static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    uvm_push_t *pushes;
    uvm_tracker_t tracker;

    // When the Confidential Computing feature is enabled, a channel reserved at
    // the start of a push cannot be reserved again until that push ends. The
    // test is waived, because the number of pushes it starts per pool exceeds
    // the number of channels in the pool, so it would block indefinitely.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    uvm_tracker_init(&tracker);

    // As noted above, this test does unsafe things that would be detected by
    // lock tracking, so opt out of the tracking.
    uvm_thread_context_lock_disable_tracking();

    pushes = uvm_kvmalloc_zero(sizeof(*pushes) * UVM_PUSH_MAX_CONCURRENT_PUSHES);
    if (pushes == NULL) {
        status = NV_ERR_NO_MEMORY;
        goto done;
    }

    for_each_va_space_gpu(gpu, va_space) {
        NvU32 i;

        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, push, "concurrent push %u", i);
            TEST_CHECK_GOTO(status == NV_OK, done);
        }
        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            uvm_push_end(push);
            TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, push), done);
        }
        TEST_CHECK_GOTO(tracker.size != 0, done);

        status = uvm_tracker_wait(&tracker);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

done:
    uvm_thread_context_lock_enable_tracking();

    uvm_tracker_deinit(&tracker);

    uvm_kvfree(pushes);

    return status;
}

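// Push completion callbacks used by the interleaving test to verify that
// on_complete() is invoked exactly once per push.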
static void add_to_counter(void* ptr, int value)
{
    atomic_t *atomic = (atomic_t*) ptr;
    atomic_add(value, atomic);
}

static void add_one_to_counter(void* ptr)
{
    add_to_counter(ptr, 1);
}

static void add_two_to_counter(void* ptr)
{
    add_to_counter(ptr, 2);
}

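// Single-GPU helper for test_push_interleaving(); see the comment above that
// function for a description of the sequence exercised here.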
static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
{
    NV_STATUS status;
    uvm_channel_t *channel;
    uvm_push_t push;
    NvU32 i;
    NvU32 *host_va;
    NvU64 gpu_va;
    NvU32 observed, expected;
    unsigned int num_non_paused_pushes;
    uvm_push_t pushes_not_ended[TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES];
    const NvLength size = sizeof(NvU32) * (1 + TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES);
    uvm_rm_mem_t *mem = NULL;
    atomic_t on_complete_counter = ATOMIC_INIT(0);

    // TODO: Bug 3839176: the test is waived when Confidential Computing is
    // enabled because it assumes that the GPU can access system memory without
    // using encryption.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    // This test issues virtual memcopies/memsets, which in SR-IOV heavy cannot
    // be pushed to a proxy channel. Pushing to a UVM internal CE channel works
    // in all scenarios.
    channel = uvm_channel_any_of_type(gpu->channel_manager, UVM_CHANNEL_POOL_TYPE_CE);
    TEST_CHECK_RET(channel != NULL);

    if (channel->num_gpfifo_entries <= TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES) {
        UVM_TEST_PRINT("Insufficient number of gpfifo entries per channel to run this test. Expected at least %u "
                       "entries, but found %u\n",
                       TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES + 1,
                       channel->num_gpfifo_entries);
        return NV_ERR_INVALID_STATE;
    }
    num_non_paused_pushes = channel->num_gpfifo_entries;

    // The UVM driver only allows push interleaving across separate threads, but
    // it is hard to consistently replicate the interleaving. Instead, we
    // temporarily disable lock tracking, so we can interleave pushes from a
    // single thread.
    uvm_thread_context_lock_disable_tracking();

    status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);
    host_va = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
    gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(channel)).address;
    memset(host_va, 0, size);

    // Begin a few pushes on the channel, but do not end them yet.
    // Each pushed method sets a magic number on an independent memory location.
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, pushes_not_ended + i, "Set to 0x%x", 0xDEADBEEF + i);
        TEST_CHECK_GOTO(status == NV_OK, done);
        gpu->parent->ce_hal->memset_v_4(pushes_not_ended + i,
                                        gpu_va + sizeof(NvU32) * (i + 1),
                                        0xDEADBEEF + i,
                                        sizeof(NvU32));

        push_info = uvm_push_info_from_push(pushes_not_ended + i);
        push_info->on_complete = add_two_to_counter;
        push_info->on_complete_data = &on_complete_counter;
    }

    // Push N (N = #channel entries) value increments to the same channel.
    for (i = 0; i < num_non_paused_pushes; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, &push, "inc to %u", i + 1);
        TEST_CHECK_GOTO(status == NV_OK, done);
        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, num_non_paused_pushes);

        push_info = uvm_push_info_from_push(&push);
        push_info->on_complete = add_one_to_counter;
        push_info->on_complete_data = &on_complete_counter;

        uvm_push_end(&push);
    }

    // End the pending pushes
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i)
        uvm_push_end(pushes_not_ended + i);

    // When the channel manager becomes idle, the GPU methods have been
    // completed, and the CPU completion callbacks associated with the pushes
    // have been invoked.
    status = uvm_channel_manager_wait(channel->pool->manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    observed = host_va[0];
    expected = num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Observed counter %u but expected %u\n", observed, expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        observed = host_va[i + 1];
        expected = 0xDEADBEEF + i;
        if (observed != expected) {
            UVM_TEST_PRINT("Observed magic number 0x%x but expected 0x%x\n", observed, expected);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

    observed = atomic_read(&on_complete_counter);
    expected = TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES * 2 + num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Wrong value of counter incremented by push info callback. Observed %u but expected %u\n",
                       observed,
                       expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_rm_mem_free(mem);
    uvm_thread_context_lock_enable_tracking();

    return status;
}

// Using a single thread, interleave pushes and check that the result is
// consistent with a non-interleaved sequence.
// 1) Begin a few pushes in channel X but do not end them. Each pushed (GPU)
//    method sets an individual value in an independent system memory location.
//    Each push is associated with a push info (CPU) callback that atomically
//    adds 2 to a memory location M.
// 2) Begin and end many pushes in the same channel X such that all the gpfifo
//    entries are filled. All the pushed methods do the same thing: atomically
//    increment a given system memory location.
//    Each push is associated with a push info callback that atomically
//    increments the memory location M.
// 3) End the pending pushes.
//
// The final state should be the same as in the non-interleaved sequence
// (1)-(3)-(2).
//
// Starting more than a single push is not safe to do outside of a test: if
// multiple threads tried doing so, it could easily deadlock.
static NV_STATUS test_push_interleaving(uvm_va_space_t *va_space)
{
    NV_STATUS status;
    uvm_gpu_t *gpu;

    BUILD_BUG_ON(TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES >= UVM_PUSH_MAX_CONCURRENT_PUSHES);

    for_each_va_space_gpu(gpu, va_space) {
        status = test_push_interleaving_on_gpu(gpu);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

// Push exactly UVM_MAX_PUSH_SIZE bytes of methods while acquiring a semaphore.
// This is very tightly coupled with the pushbuffer implementation and method
// sizes, which is not ideal, but allows testing corner cases in the pushbuffer
// management code.
static NV_STATUS test_push_exactly_max_push(uvm_gpu_t *gpu,
                                            uvm_push_t *push,
                                            uvm_channel_type_t channel_type,
                                            uvm_gpu_semaphore_t *sema_to_acquire,
                                            NvU32 value)
{
    NV_STATUS status;
    NvU64 semaphore_gpu_va;
    NvU32 push_end_size;

    status = uvm_push_begin(gpu->channel_manager, channel_type, push, "Test push");
    if (status != NV_OK)
        return status;

    TEST_CHECK_RET(uvm_push_has_space(push, UVM_MAX_PUSH_SIZE - get_push_begin_size(push->channel)));
    TEST_CHECK_RET(!uvm_push_has_space(push, UVM_MAX_PUSH_SIZE - get_push_begin_size(push->channel) + 1));

    semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(sema_to_acquire, gpu, uvm_channel_is_proxy(push->channel));
    gpu->parent->host_hal->semaphore_acquire(push, semaphore_gpu_va, value);

    // Push a noop leaving just push_end_size bytes in the pushbuffer.
    push_end_size = get_push_end_max_size(push->channel);
    gpu->parent->host_hal->noop(push, UVM_MAX_PUSH_SIZE - uvm_push_get_size(push) - push_end_size);

    TEST_CHECK_RET(uvm_push_has_space(push, push_end_size));
    TEST_CHECK_RET(!uvm_push_has_space(push, push_end_size + 1));
    uvm_push_end(push);

    UVM_ASSERT_MSG(uvm_push_get_size(push) == UVM_MAX_PUSH_SIZE, "push_size %u\n", uvm_push_get_size(push));

    return NV_OK;
}

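// Helpers returning how many pushbuffer chunks are currently marked idle or
// available in the pushbuffer's bitmaps.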
static NvU32 test_count_idle_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 i;
    NvU32 count = 0;
    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
        count += test_bit(i, pushbuffer->idle_chunks) ? 1 : 0;
    return count;
}

static NvU32 test_count_available_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 i;
    NvU32 count = 0;
    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
        count += test_bit(i, pushbuffer->available_chunks) ? 1 : 0;
    return count;
}

// Reuse the whole pushbuffer 4 times, one UVM_MAX_PUSH_SIZE push at a time
#define EXTRA_MAX_PUSHES_WHILE_FULL (4 * UVM_PUSHBUFFER_SIZE / UVM_MAX_PUSH_SIZE)

// Test doing pushes of exactly UVM_MAX_PUSH_SIZE bytes and only allowing them
// to complete one by one.
static NV_STATUS test_max_pushes_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    uvm_tracker_t tracker;
    uvm_gpu_semaphore_t sema;
    NvU32 total_push_size = 0;
    NvU32 push_count = 0;
    NvU32 i;
    uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;

    uvm_tracker_init(&tracker);

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Use a SEC2 channel when Confidential Computing is enabled since all other
    // channel types need extra space for work launch, and the channel type
    // really doesn't matter for this test.
    if (g_uvm_global.conf_computing_enabled)
        channel_type = UVM_CHANNEL_TYPE_SEC2;

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    while (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
        uvm_push_t push;

        ++push_count;

        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        total_push_size += uvm_push_get_size(&push);
        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

    if (total_push_size != UVM_PUSHBUFFER_SIZE) {
        UVM_TEST_PRINT("Unexpected space in the pushbuffer, total push %u\n", total_push_size);
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    TEST_CHECK_GOTO(test_count_available_chunks(gpu->channel_manager->pushbuffer) == 0, done);
    TEST_CHECK_GOTO(test_count_idle_chunks(gpu->channel_manager->pushbuffer) == 0, done);

    for (i = 0; i < EXTRA_MAX_PUSHES_WHILE_FULL; ++i) {
        uvm_push_t push;

        // There should be no space for another push until the sema is
        // incremented. Incrementing the sema allows a single push to complete,
        // freeing exactly UVM_MAX_PUSH_SIZE of space.
        if (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
            UVM_TEST_PRINT("Unexpected space in the pushbuffer for iter %d\n", i);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }

        uvm_gpu_semaphore_set_payload(&sema, i + 1);

        ++push_count;

        // Take UVM_MAX_PUSH_SIZE space. This should leave no space left again.
        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

done:
    uvm_gpu_semaphore_set_payload(&sema, push_count);
    uvm_tracker_wait_deinit(&tracker);

    uvm_gpu_semaphore_free(&sema);

    return status;
}

// Test doing UVM_PUSHBUFFER_CHUNKS independent pushes expecting each one to use
// a different chunk in the pushbuffer.
static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    uvm_gpu_semaphore_t sema;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NvU32 i;
    uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;

    // Use a SEC2 channel when Confidential Computing is enabled since all other
    // channel types need extra space for work launch, and the channel type
    // really doesn't matter for this test.
    if (g_uvm_global.conf_computing_enabled)
        channel_type = UVM_CHANNEL_TYPE_SEC2;

    uvm_tracker_init(&tracker);

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i) {
        NvU64 semaphore_gpu_va;
        uvm_push_t push;

        status = uvm_push_begin(gpu->channel_manager, channel_type, &push, "Push using chunk %u", i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(&sema, gpu, uvm_channel_is_proxy(push.channel));
        gpu->parent->host_hal->semaphore_acquire(&push, semaphore_gpu_va, i + 1);
        uvm_push_end(&push);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);

        if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS - i - 1) {
            UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u instead of %u\n",
                           test_count_idle_chunks(gpu->channel_manager->pushbuffer), UVM_PUSHBUFFER_CHUNKS - i - 1);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }
    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);

    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS) {
        UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u\n",
                       test_count_idle_chunks(gpu->channel_manager->pushbuffer));
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);
    uvm_tracker_wait(&tracker);

    uvm_gpu_semaphore_free(&sema);
    uvm_tracker_deinit(&tracker);

    return status;
}

static NV_STATUS test_pushbuffer(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_NV_CHECK_RET(test_max_pushes_on_gpu(gpu));
        TEST_NV_CHECK_RET(test_idle_chunks_on_gpu(gpu));
    }

    return NV_OK;
}

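// State shared between the timestamp test and its push completion callback:
// the callback snapshots the timestamp value the GPU wrote into the pushbuffer.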
typedef struct
{
    NvU64 *timestamp_in_pushbuffer;
    NvU64 timestamp;
} timestamp_test_t;

static void timestamp_on_complete(void *void_data)
{
    timestamp_test_t *data = (timestamp_test_t *)void_data;

    if (uvm_global_get_status() != NV_OK) {
        // Do nothing if a global error has been set as the callback might be
        // called from teardown where the reference to test data is no longer
        // valid.
        return;
    }

    data->timestamp = *data->timestamp_in_pushbuffer;
}

static NV_STATUS test_timestamp_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_push_t push;
    timestamp_test_t test_data = {0};
    NvU32 i;
    NvU64 last_stamp = 0;

    for (i = 0; i < 10; ++i) {
        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Releasing a timestamp");
        if (status != NV_OK)
            return status;

        test_data.timestamp_in_pushbuffer = uvm_push_timestamp(&push);
        uvm_push_info_from_push(&push)->on_complete = timestamp_on_complete;
        uvm_push_info_from_push(&push)->on_complete_data = &test_data;
        uvm_push_end(&push);

        // Synchronize the channel manager to make sure the on_complete
        // callbacks have a chance to run.
        status = uvm_channel_manager_wait(gpu->channel_manager);
        TEST_CHECK_RET(status == NV_OK);

        TEST_CHECK_RET(test_data.timestamp != 0);
        TEST_CHECK_RET(test_data.timestamp > last_stamp);
        last_stamp = test_data.timestamp;
    }

    return NV_OK;
}

static NV_STATUS test_timestamp(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space)
        TEST_CHECK_RET(test_timestamp_on_gpu(gpu) == NV_OK);

    return NV_OK;
}

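// Synchronously copy src to dst using a channel of the given type. CPU<->GPU
// copies use a single virtual-address memcopy; GPU-to-GPU copies are split
// into chunk-sized memcopies using uvm_mem_gpu_address_copy().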
static NV_STATUS sync_memcopy(uvm_channel_type_t type, uvm_mem_t *dst, uvm_mem_t *src)
{
    uvm_push_t push;
    uvm_gpu_address_t dst_va;
    uvm_gpu_address_t src_va;
    uvm_gpu_t *gpu;
    NV_STATUS status;

    UVM_ASSERT(uvm_mem_is_vidmem(src) || uvm_mem_is_vidmem(dst));

    if (type == UVM_CHANNEL_TYPE_CPU_TO_GPU || type == UVM_CHANNEL_TYPE_GPU_TO_CPU) {
        gpu = (type == UVM_CHANNEL_TYPE_CPU_TO_GPU) ? dst->backing_gpu : src->backing_gpu;
        status = uvm_push_begin(gpu->channel_manager, type, &push, uvm_channel_type_to_string(type));
        if (status != NV_OK)
            return status;

        dst_va = uvm_mem_gpu_address_virtual_kernel(dst, gpu);
        src_va = uvm_mem_gpu_address_virtual_kernel(src, gpu);
        gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, src->size);
    }
    else {
        unsigned i;
        const NvU32 chunk_size = src->chunk_size;

        UVM_ASSERT((src->size % chunk_size) == 0);

        gpu = src->backing_gpu;
        status = uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                           dst->backing_gpu,
                                           &push,
                                           uvm_channel_type_to_string(type));
        if (status != NV_OK)
            return status;

        for (i = 0; i < src->size / chunk_size; i++) {
            dst_va = uvm_mem_gpu_address_copy(dst, gpu, i * chunk_size, chunk_size);
            src_va = uvm_mem_gpu_address_copy(src, gpu, i * chunk_size, chunk_size);
            gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, chunk_size);
        }
    }

    return uvm_push_end_and_wait(&push);
}

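// Return true if gpu_a and gpu_b are distinct GPUs that can directly copy to
// and from each other's memory (indirect peers are excluded).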
static bool can_do_peer_copies(uvm_va_space_t *va_space, uvm_gpu_t *gpu_a, uvm_gpu_t *gpu_b)
{
    if (gpu_a == gpu_b || !uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_a->id)], gpu_b->id))
        return false;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_b->id)], gpu_a->id));

    // TODO: Bug 2028875. Indirect peers are not supported for now.
    if (uvm_gpus_are_indirect_peers(gpu_a, gpu_b))
        return false;

    return true;
}

// Test the GPU to GPU push interface by transferring data between each
// permutation of GPU peers.
static NV_STATUS test_push_gpu_to_gpu(uvm_va_space_t *va_space)
{
    NvU32 i;
    NV_STATUS status;
    uvm_gpu_t *gpu, *gpu_a, *gpu_b;
    uvm_mem_t **mem;
    NvU32 *host_ptr;
    const size_t size = 1024 * 1024;
    bool waive = true;

    // TODO: Bug 3839176: the test is waived when Confidential Computing is
    // enabled because it assumes that the GPU can access system memory without
    // using encryption.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (can_do_peer_copies(va_space, gpu_a, gpu_b)) {
                waive = false;
                break;
            }
        }
    }

    if (waive)
        return NV_OK;

    mem = uvm_kvmalloc_zero(sizeof(*mem) * UVM_ID_MAX_PROCESSORS);
    if (!mem)
        return NV_ERR_NO_MEMORY;

    // Alloc and initialize host buffer
    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &mem[UVM_ID_CPU_VALUE]);
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_ptr = (NvU32 *)uvm_mem_get_cpu_addr_kernel(mem[UVM_ID_CPU_VALUE]);

    for (i = 0; i < size / sizeof(NvU32); ++i)
        host_ptr[i] = i + 1;

    // Allocate vidmem on each GPU, and map the host buffer
    for_each_va_space_gpu(gpu, va_space) {
        status = uvm_mem_alloc_vidmem(size, gpu, &mem[uvm_id_value(gpu->id)]);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[uvm_id_value(gpu->id)], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[UVM_ID_CPU_VALUE], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

    // Copy buffer between each pair of GPU peers, in both directions
    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (!can_do_peer_copies(va_space, gpu_a, gpu_b))
                continue;

            // Copy from CPU to the first GPU, and then zero out the host copy
            status = sync_memcopy(UVM_CHANNEL_TYPE_CPU_TO_GPU,
                                  mem[uvm_id_value(gpu_a->id)],
                                  mem[UVM_ID_CPU_VALUE]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            memset(host_ptr, 0, size);

            // Copy from the first GPU to the second GPU
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_GPU,
                                  mem[uvm_id_value(gpu_b->id)],
                                  mem[uvm_id_value(gpu_a->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Copy from the second GPU back to the host, and check result
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_CPU,
                                  mem[UVM_ID_CPU_VALUE],
                                  mem[uvm_id_value(gpu_b->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            for (i = 0; i < size / sizeof(NvU32); ++i) {
                if (host_ptr[i] != i + 1) {
                    UVM_TEST_PRINT("host_ptr[%u] = %u instead of %u when copying between %s and %s\n",
                                   i,
                                   host_ptr[i],
                                   i + 1,
                                   uvm_gpu_name(gpu_a),
                                   uvm_gpu_name(gpu_b));
                    status = NV_ERR_INVALID_STATE;
                    TEST_CHECK_GOTO(status == NV_OK, done);
                }
            }
        }
    }

done:
    for_each_va_space_gpu(gpu, va_space)
        uvm_mem_free(mem[uvm_id_value(gpu->id)]);

    uvm_mem_free(mem[UVM_ID_CPU_VALUE]);
    uvm_kvfree(mem);

    return status;
}

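// Entry point that runs all of the push sanity subtests for the given VA
// space.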
NV_STATUS uvm_test_push_sanity(UVM_TEST_PUSH_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    // Take the global lock as some of the tests rely on being the
    // only thread doing pushes and could deadlock otherwise.
    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    status = test_push_end_size(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_inline_data(va_space);
    if (status != NV_OK)
        goto done;

    status = test_concurrent_pushes(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_interleaving(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_gpu_to_gpu(va_space);
    if (status != NV_OK)
        goto done;

    status = test_pushbuffer(va_space);
    if (status != NV_OK)
        goto done;

    if (!params->skipTimestampTest) {
        status = test_timestamp(va_space);
        if (status != NV_OK)
            goto done;
    }

done:
    uvm_va_space_up_read_rm(va_space);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return status;
}