/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include <asm/atomic.h>

#include "uvm_global.h"
#include "uvm_channel.h"
#include "uvm_hal.h"
#include "uvm_mem.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_test_rng.h"
#include "uvm_thread_context.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_gpu_semaphore.h"
#include "uvm_kvmalloc.h"

#define TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES 2

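// Space (in bytes) that uvm_push_begin() reserves at the start of a push,
// before any methods from the caller are added.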
static NvU32 get_push_begin_size(uvm_channel_t *channel)
{
    // SEC2 channels allocate CSL signature buffer at the beginning.
    if (uvm_channel_is_sec2(channel))
        return UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE + UVM_METHOD_SIZE;

    return 0;
}

// This is the storage required by a semaphore release.
static NvU32 get_push_end_min_size(uvm_channel_t *channel)
{
    if (g_uvm_global.conf_computing_enabled) {
        if (uvm_channel_is_ce(channel)) {
            // Space (in bytes) used by uvm_push_end() on a CE channel when
            // the Confidential Computing feature is enabled.
            //
            // Note that CE semaphore release pushes two memsets and one
            // encryption method on top of the regular release.
            // Memset size
            // -------------
            // PUSH_2U (SET_REMAP)      : 3 Words
            // PUSH_2U (OFFSET_OUT)     : 3 Words
            // PUSH_1U (LINE_LENGTH_IN) : 2 Words
            // PUSH_1U (LAUNCH_DMA)     : 2 Words
            // Total 10 * UVM_METHOD_SIZE : 40 Bytes
            //
            // Encrypt size
            // -------------
            // PUSH_1U (SET_SECURE_COPY_MODE) : 2 Words
            // PUSH_4U (ENCRYPT_AUTH_TAG + IV): 5 Words
            // PUSH_4U (OFFSET_IN_OUT)        : 5 Words
            // PUSH_2U (LINE_LENGTH_IN)       : 2 Words
            // PUSH_2U (LAUNCH_DMA)           : 2 Words
            // Total 16 * UVM_METHOD_SIZE     : 64 Bytes
            //
            // TOTAL : 144 Bytes

            if (uvm_channel_is_wlc(channel)) {
                // Same as CE + LCIC GPPut update + LCIC doorbell
                return 24 + 144 + 24 + 24;
            }

            return 24 + 144;
        }

        UVM_ASSERT(uvm_channel_is_sec2(channel));

        // A perfectly aligned inline buffer in SEC2 semaphore release.
        // We add UVM_METHOD_SIZE because of the NOP method to reserve
        // UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES (the inline buffer).
        return 48 + UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES + UVM_METHOD_SIZE;
    }

    UVM_ASSERT(uvm_channel_is_ce(channel));

    // Space (in bytes) used by uvm_push_end() on a CE channel.
    return 24;
}

static NvU32 get_push_end_max_size(uvm_channel_t *channel)
{
    // WLC pushes are always padded to UVM_MAX_WLC_PUSH_SIZE
    if (uvm_channel_is_wlc(channel))
        return UVM_MAX_WLC_PUSH_SIZE;

    // Space (in bytes) used by uvm_push_end() on a SEC2 channel.
    // Note that SEC2 semaphore release uses an inline buffer with alignment
    // requirements. This is the "worst" case semaphore_release storage.
    if (uvm_channel_is_sec2(channel))
        return 48 + UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES + UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT;

    UVM_ASSERT(uvm_channel_is_ce(channel));

    // Space (in bytes) used by uvm_push_end() on a CE channel.
    return get_push_end_min_size(channel);
}

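// Check that the size added by uvm_push_end() on every supported channel type
// falls within the [min, max] range predicted by the helpers above.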
static NV_STATUS test_push_end_size(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_type_t type;

        for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; ++type) {
            uvm_push_t push;
            NvU32 push_size_before;
            NvU32 push_end_size_observed;
            NvU32 push_end_size_expected[2];

            // SEC2 is only available when Confidential Computing is enabled
            if ((type == UVM_CHANNEL_TYPE_SEC2) && !g_uvm_global.conf_computing_enabled)
                continue;

            // WLC is only available when Confidential Computing is enabled
            if ((type == UVM_CHANNEL_TYPE_WLC) && !g_uvm_global.conf_computing_enabled)
                continue;

            // LCIC doesn't accept pushes
            if (type == UVM_CHANNEL_TYPE_LCIC)
                continue;

            TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
                                             type,
                                             &push,
                                             "type %s",
                                             uvm_channel_type_to_string(type)));

            push_size_before = uvm_push_get_size(&push);
            uvm_push_end(&push);
            push_end_size_observed = uvm_push_get_size(&push) - push_size_before;

            push_end_size_expected[0] = get_push_end_min_size(push.channel);
            push_end_size_expected[1] = get_push_end_max_size(push.channel);

            if (push_end_size_observed < push_end_size_expected[0] ||
                push_end_size_observed > push_end_size_expected[1]) {
                UVM_TEST_PRINT("push_end_size incorrect, %u instead of [%u:%u] on channel type %s for GPU %s\n",
                               push_end_size_observed,
                               push_end_size_expected[0],
                               push_end_size_expected[1],
                               uvm_channel_type_to_string(type),
                               uvm_gpu_name(gpu));

                // The size mismatch error gets precedence over a wait error
                (void) uvm_push_wait(&push);

                return NV_ERR_INVALID_STATE;
            }

            TEST_NV_CHECK_RET(uvm_push_wait(&push));
        }
    }

    return NV_OK;
}

typedef enum {
    TEST_INLINE_ADD,
    TEST_INLINE_GET,
    TEST_INLINE_SINGLE_BUFFER,
    TEST_INLINE_MAX,
} test_inline_type_t;

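// Exercise the three inline data interfaces (incremental add, get, and single
// buffer) with various sizes, then memcopy the inline data to sysmem and
// verify its contents on the CPU.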
static NV_STATUS test_push_inline_data_gpu(uvm_gpu_t *gpu)
{
    static const size_t test_sizes[] = { 1, 2, 3, 4, 8, 31, 32, 1023, 1024, 1025, UVM_PUSH_INLINE_DATA_MAX_SIZE };
    NV_STATUS status;
    int i, j;
    int test_inline_type;
    uvm_push_t push;
    uvm_mem_t *mem = NULL;
    char *verif;

    // TODO: Bug 3839176: test is waived on Confidential Computing because
    // it assumes that GPU can access system memory without using encryption.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_PUSH_INLINE_DATA_MAX_SIZE, current->mm, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);

    status = uvm_mem_map_gpu_kernel(mem, gpu);
    TEST_CHECK_GOTO(status == NV_OK, done);

    verif = (char *)uvm_mem_get_cpu_addr_kernel(mem);

    for (test_inline_type = 0; test_inline_type < TEST_INLINE_MAX; ++test_inline_type) {
        for (i = 0; i < ARRAY_SIZE(test_sizes); ++i) {
            size_t test_size = test_sizes[i];
            uvm_push_inline_data_t data;
            size_t inline_data_size = 0;
            uvm_gpu_address_t data_gpu_address;
            char *inline_buf;

            status = uvm_push_begin(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_GPU_INTERNAL,
                                    &push,
                                    "Inline data size %zu",
                                    test_size);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Do a noop first to test inline data starting at different offsets
            gpu->parent->host_hal->noop(&push, roundup(min(test_size, (size_t)4096), UVM_METHOD_SIZE));

            switch (test_inline_type) {
                case TEST_INLINE_ADD:
                    uvm_push_inline_data_begin(&push, &data);
                    for (j = 0; j < test_size; ++j) {
                        char value = 1 + i + j;
                        uvm_push_inline_data_add(&data, &value, 1);
                    }
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    break;
                case TEST_INLINE_GET:
                    uvm_push_inline_data_begin(&push, &data);
                    inline_buf = (char*)uvm_push_inline_data_get(&data, test_size);
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
                case TEST_INLINE_SINGLE_BUFFER:
                    inline_buf = (char*)uvm_push_get_single_inline_buffer(&push,
                                                                          test_size,
                                                                          UVM_METHOD_SIZE,
                                                                          &data_gpu_address);
                    inline_data_size = test_size;
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
            }

            gpu->parent->ce_hal->memcopy(&push,
                                         uvm_mem_gpu_address_virtual_kernel(mem, gpu),
                                         data_gpu_address,
                                         test_size);
            status = uvm_push_end_and_wait(&push);
            TEST_CHECK_GOTO(status == NV_OK, done);

            TEST_CHECK_GOTO(inline_data_size == test_size, done);

            for (j = 0; j < test_size; ++j) {
                char expected = 1 + i + j;
                if (verif[j] != expected) {
                    UVM_TEST_PRINT("size %zu verif[%d] = %d instead of %d\n", test_size, j, verif[j], expected);
                    status = NV_ERR_INVALID_STATE;
                    goto done;
                }
            }
        }
    }

done:
    uvm_mem_free(mem);

    return status;
}

static NV_STATUS test_push_inline_data(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_CHECK_RET(test_push_inline_data_gpu(gpu) == NV_OK);
    }

    return NV_OK;
}

// Test that begins UVM_PUSH_MAX_CONCURRENT_PUSHES number of pushes before
// ending any of them on each GPU.
// Notably starting more than a single push is not safe to do outside of a test
// as if multiple threads tried doing so, it could easily deadlock.
static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    uvm_push_t *pushes;
    uvm_tracker_t tracker;

    // When the Confidential Computing feature is enabled, a channel reserved at
    // the start of a push cannot be reserved again until that push ends. The
    // test is waived, because the number of pushes it starts per pool exceeds
    // the number of channels in the pool, so it would block indefinitely.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    uvm_tracker_init(&tracker);

    // As noted above, this test does unsafe things that would be detected by
    // lock tracking, opt-out.
    uvm_thread_context_lock_disable_tracking();

    pushes = uvm_kvmalloc_zero(sizeof(*pushes) * UVM_PUSH_MAX_CONCURRENT_PUSHES);
    if (pushes == NULL) {
        status = NV_ERR_NO_MEMORY;
        goto done;
    }

    for_each_va_space_gpu(gpu, va_space) {
        NvU32 i;

        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, push, "concurrent push %u", i);
            TEST_CHECK_GOTO(status == NV_OK, done);
        }

        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            uvm_push_end(push);
            TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, push), done);
        }

        TEST_CHECK_GOTO(tracker.size != 0, done);

        status = uvm_tracker_wait(&tracker);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

done:
    uvm_thread_context_lock_enable_tracking();

    uvm_tracker_deinit(&tracker);

    uvm_kvfree(pushes);

    return status;
}

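// Completion (on_complete) callbacks used by the interleaving test below: each
// atomically adds a fixed value to the counter pointed to by ptr.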
static void add_to_counter(void* ptr, int value)
{
    atomic_t *atomic = (atomic_t*) ptr;
    atomic_add(value, atomic);
}

static void add_one_to_counter(void* ptr)
{
    add_to_counter(ptr, 1);
}

static void add_two_to_counter(void* ptr)
{
    add_to_counter(ptr, 2);
}

static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
{
    NV_STATUS status;
    uvm_channel_t *channel;
    uvm_push_t push;
    NvU32 i;
    NvU32 *host_va;
    NvU64 gpu_va;
    NvU32 observed, expected;
    unsigned int num_non_paused_pushes;
    uvm_push_t pushes_not_ended[TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES];
    const NvLength size = sizeof(NvU32) * (1 + TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES);
    uvm_rm_mem_t *mem = NULL;
    atomic_t on_complete_counter = ATOMIC_INIT(0);

    // TODO: Bug 3839176: test is waived on Confidential Computing because
    // it assumes that GPU can access system memory without using encryption.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    // This test issues virtual memcopies/memsets, which in SR-IOV heavy cannot
    // be pushed to a proxy channel. Pushing to a UVM internal CE channel works
    // in all scenarios.
    channel = uvm_channel_any_of_type(gpu->channel_manager, UVM_CHANNEL_POOL_TYPE_CE);
    TEST_CHECK_RET(channel != NULL);

    if (channel->num_gpfifo_entries <= TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES) {
        UVM_TEST_PRINT("Insufficient number of gpfifo entries per channel to run this test. Expected at least %u "
                       "entries, but found %u\n",
                       TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES + 1,
                       channel->num_gpfifo_entries);
        return NV_ERR_INVALID_STATE;
    }

    num_non_paused_pushes = channel->num_gpfifo_entries;

    // The UVM driver only allows push interleaving across separate threads, but
    // it is hard to consistently replicate the interleaving. Instead, we
    // temporarily disable lock tracking, so we can interleave pushes from a
    // single thread.
    uvm_thread_context_lock_disable_tracking();

    status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);
    host_va = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
    gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(channel)).address;
    memset(host_va, 0, size);

    // Begin a few pushes on the channel, but do not end them yet.
    // Each pushed method sets a magic number on an independent memory location.
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, pushes_not_ended + i, "Set to 0x%x", 0xDEADBEEF + i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        gpu->parent->ce_hal->memset_v_4(pushes_not_ended + i,
                                        gpu_va + sizeof(NvU32) * (i + 1),
                                        0xDEADBEEF + i,
                                        sizeof(NvU32));

        push_info = uvm_push_info_from_push(pushes_not_ended + i);
        push_info->on_complete = add_two_to_counter;
        push_info->on_complete_data = &on_complete_counter;
    }

    // Push N (N = #channel entries) value increments to the same channel.
    for (i = 0; i < num_non_paused_pushes; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, &push, "inc to %u", i + 1);
        TEST_CHECK_GOTO(status == NV_OK, done);

        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, num_non_paused_pushes);

        push_info = uvm_push_info_from_push(&push);
        push_info->on_complete = add_one_to_counter;
        push_info->on_complete_data = &on_complete_counter;

        uvm_push_end(&push);
    }

    // End the pending pushes
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i)
        uvm_push_end(pushes_not_ended + i);

    // When the channel manager becomes idle, the GPU methods have been
    // completed, and the CPU completion callbacks associated with the push
    // have been invoked.
    status = uvm_channel_manager_wait(channel->pool->manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    observed = host_va[0];
    expected = num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Observed counter %u but expected %u\n", observed, expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        observed = host_va[i + 1];
        expected = 0xDEADBEEF + i;
        if (observed != expected) {
            UVM_TEST_PRINT("Observed magic number 0x%x but expected 0x%x\n", observed, expected);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

    observed = atomic_read(&on_complete_counter);
    expected = TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES * 2 + num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Wrong value of counter incremented by push info callback. Observed %u but expected %u\n",
                       observed,
                       expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_rm_mem_free(mem);
    uvm_thread_context_lock_enable_tracking();

    return status;
}

// Using a single thread, interleave pushes and check that the result is
// consistent with a non-interleaved sequence.
// 1) Begin a few pushes in channel X but do not end them. Each pushed (GPU)
//    method sets an individual value in an independent system memory location.
//    Each push is associated with a push info (CPU) callback that atomically
//    adds 2 to a memory location M
// 2) Begin and end many pushes in the same channel X such that all the gpfifo
//    entries are filled. All the pushed methods do the same thing: atomically
//    increment a given system memory location.
//    Each push is associated with a push info callback that atomically
//    increments the memory location M
// 3) End the pending pushes
//
// The final state should be the same as in the non-interleaved sequence
// (1)-(3)-(2)
//
// Starting more than a single push is not safe to do outside of a test as if
// multiple threads tried doing so, it could easily deadlock.
static NV_STATUS test_push_interleaving(uvm_va_space_t *va_space)
{
    NV_STATUS status;
    uvm_gpu_t *gpu;

    BUILD_BUG_ON(TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES >= UVM_PUSH_MAX_CONCURRENT_PUSHES);

    for_each_va_space_gpu(gpu, va_space) {
        status = test_push_interleaving_on_gpu(gpu);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

// Push exactly UVM_MAX_PUSH_SIZE methods while acquiring a semaphore.
// This is very tightly coupled with the pushbuffer implementation and method
// sizes, which is not ideal, but allows testing corner cases in the pushbuffer
// management code.
static NV_STATUS test_push_exactly_max_push(uvm_gpu_t *gpu,
                                            uvm_push_t *push,
                                            uvm_channel_type_t channel_type,
                                            uvm_gpu_semaphore_t *sema_to_acquire,
                                            NvU32 value)
{
    NV_STATUS status;
    NvU64 semaphore_gpu_va;
    NvU32 push_end_size;

    status = uvm_push_begin(gpu->channel_manager, channel_type, push, "Test push");
    if (status != NV_OK)
        return status;

    TEST_CHECK_RET(uvm_push_has_space(push, UVM_MAX_PUSH_SIZE - get_push_begin_size(push->channel)));
    TEST_CHECK_RET(!uvm_push_has_space(push, UVM_MAX_PUSH_SIZE - get_push_begin_size(push->channel) + 1));

    semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(sema_to_acquire, gpu, uvm_channel_is_proxy(push->channel));
    gpu->parent->host_hal->semaphore_acquire(push, semaphore_gpu_va, value);

    // Push a noop leaving just push_end_size in the pushbuffer.
    push_end_size = get_push_end_max_size(push->channel);
    gpu->parent->host_hal->noop(push, UVM_MAX_PUSH_SIZE - uvm_push_get_size(push) - push_end_size);

    TEST_CHECK_RET(uvm_push_has_space(push, push_end_size));
    TEST_CHECK_RET(!uvm_push_has_space(push, push_end_size + 1));
    uvm_push_end(push);

    UVM_ASSERT_MSG(uvm_push_get_size(push) == UVM_MAX_PUSH_SIZE, "push_size %u\n", uvm_push_get_size(push));

    return NV_OK;
}

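// Return how many pushbuffer chunks are currently marked idle (not in use by
// any pending push).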
static NvU32 test_count_idle_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 i;
    NvU32 count = 0;
    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
        count += test_bit(i, pushbuffer->idle_chunks) ? 1 : 0;
    return count;
}

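// Return how many pushbuffer chunks are currently marked available for new
// pushes.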
static NvU32 test_count_available_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 i;
    NvU32 count = 0;
    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
        count += test_bit(i, pushbuffer->available_chunks) ? 1 : 0;
    return count;
}

// Reuse the whole pushbuffer 4 times, one UVM_MAX_PUSH_SIZE at a time
#define EXTRA_MAX_PUSHES_WHILE_FULL (4 * UVM_PUSHBUFFER_SIZE / UVM_MAX_PUSH_SIZE)

// Test doing pushes of exactly UVM_MAX_PUSH_SIZE size and only allowing them to
// complete one by one.
static NV_STATUS test_max_pushes_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    uvm_tracker_t tracker;
    uvm_gpu_semaphore_t sema;
    NvU32 total_push_size = 0;
    NvU32 push_count = 0;
    NvU32 i;
    uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;

    uvm_tracker_init(&tracker);

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Use SEC2 channel when Confidential Compute is enabled since all other
    // channel types need extra space for work launch, and the channel type
    // really doesn't matter for this test.
    if (g_uvm_global.conf_computing_enabled)
        channel_type = UVM_CHANNEL_TYPE_SEC2;

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    while (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
        uvm_push_t push;

        ++push_count;

        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        total_push_size += uvm_push_get_size(&push);
        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

    if (total_push_size != UVM_PUSHBUFFER_SIZE) {
        UVM_TEST_PRINT("Unexpected space in the pushbuffer, total push %u\n", total_push_size);
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    TEST_CHECK_GOTO(test_count_available_chunks(gpu->channel_manager->pushbuffer) == 0, done);
    TEST_CHECK_GOTO(test_count_idle_chunks(gpu->channel_manager->pushbuffer) == 0, done);

    for (i = 0; i < EXTRA_MAX_PUSHES_WHILE_FULL; ++i) {
        uvm_push_t push;

        // There should be no space for another push until the sema is
        // incremented. Incrementing the same allows a single push to complete
        // freeing exactly UVM_MAX_PUSH_SIZE space.
        if (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
            UVM_TEST_PRINT("Unexpected space in the pushbuffer for iter %d\n", i);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }

        uvm_gpu_semaphore_set_payload(&sema, i + 1);

        ++push_count;

        // Take UVM_MAX_PUSH_SIZE space. This should leave no space left again.
        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

done:
    uvm_gpu_semaphore_set_payload(&sema, push_count);
    uvm_tracker_wait_deinit(&tracker);

    uvm_gpu_semaphore_free(&sema);

    return status;
}

// Test doing UVM_PUSHBUFFER_CHUNKS independent pushes expecting each one to use
// a different chunk in the pushbuffer.
static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    uvm_gpu_semaphore_t sema;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NvU32 i;
    uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;

    // Use SEC2 channel when Confidential Compute is enabled since all other
    // channel types need extra space for work launch, and the channel type
    // really doesn't matter for this test.
    if (g_uvm_global.conf_computing_enabled)
        channel_type = UVM_CHANNEL_TYPE_SEC2;

    uvm_tracker_init(&tracker);

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i) {
        NvU64 semaphore_gpu_va;
        uvm_push_t push;

        status = uvm_push_begin(gpu->channel_manager, channel_type, &push, "Push using chunk %u", i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(&sema, gpu, uvm_channel_is_proxy(push.channel));
        gpu->parent->host_hal->semaphore_acquire(&push, semaphore_gpu_va, i + 1);
        uvm_push_end(&push);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);

        if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS - i - 1) {
            UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u instead of %u\n",
                           test_count_idle_chunks(gpu->channel_manager->pushbuffer),
                           UVM_PUSHBUFFER_CHUNKS - i - 1);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);

    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS) {
        UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u\n",
                       test_count_idle_chunks(gpu->channel_manager->pushbuffer));
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);
    uvm_tracker_wait(&tracker);

    uvm_gpu_semaphore_free(&sema);
    uvm_tracker_deinit(&tracker);

    return status;
}

static NV_STATUS test_pushbuffer(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_NV_CHECK_RET(test_max_pushes_on_gpu(gpu));
        TEST_NV_CHECK_RET(test_idle_chunks_on_gpu(gpu));
    }

    return NV_OK;
}

typedef struct
{
    NvU64 *timestmap_in_pushbuffer;
    NvU64 timestamp;
} timestamp_test_t;

static void timestamp_on_complete(void *void_data)
{
    timestamp_test_t *data = (timestamp_test_t *)void_data;

    if (uvm_global_get_status() != NV_OK) {
        // Do nothing if a global error has been set as the callback might be
        // called from teardown where the reference to test data is no longer
        // valid.
        return;
    }

    data->timestamp = *data->timestmap_in_pushbuffer;
}

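// Push a series of timestamp releases and check that the reported timestamps
// are non-zero and strictly increasing.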
static NV_STATUS test_timestamp_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_push_t push;
    timestamp_test_t test_data = {0};
    NvU32 i;
    NvU64 last_stamp = 0;

    for (i = 0; i < 10; ++i) {
        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Releasing a timestamp");
        if (status != NV_OK)
            return status;

        test_data.timestmap_in_pushbuffer = uvm_push_timestamp(&push);
        uvm_push_info_from_push(&push)->on_complete = timestamp_on_complete;
        uvm_push_info_from_push(&push)->on_complete_data = &test_data;
        uvm_push_end(&push);

        // Synchronize the channel manager to make sure the on_complete
        // callbacks have a chance to run.
        status = uvm_channel_manager_wait(gpu->channel_manager);
        TEST_CHECK_RET(status == NV_OK);

        TEST_CHECK_RET(test_data.timestamp != 0);
        TEST_CHECK_RET(test_data.timestamp > last_stamp);
        last_stamp = test_data.timestamp;
    }

    return NV_OK;
}

static NV_STATUS test_timestamp(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space)
        TEST_CHECK_RET(test_timestamp_on_gpu(gpu) == NV_OK);

    return NV_OK;
}

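// Synchronously copy src to dst using a channel of the given type, waiting for
// the copy to complete before returning.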
static NV_STATUS sync_memcopy(uvm_channel_type_t type, uvm_mem_t *dst, uvm_mem_t *src)
{
    uvm_push_t push;
    uvm_gpu_address_t dst_va;
    uvm_gpu_address_t src_va;
    uvm_gpu_t *gpu;
    NV_STATUS status;

    UVM_ASSERT(uvm_mem_is_vidmem(src) || uvm_mem_is_vidmem(dst));

    if (type == UVM_CHANNEL_TYPE_CPU_TO_GPU || type == UVM_CHANNEL_TYPE_GPU_TO_CPU) {
        gpu = (type == UVM_CHANNEL_TYPE_CPU_TO_GPU) ? dst->backing_gpu : src->backing_gpu;
        status = uvm_push_begin(gpu->channel_manager, type, &push, uvm_channel_type_to_string(type));
        if (status != NV_OK)
            return status;

        dst_va = uvm_mem_gpu_address_virtual_kernel(dst, gpu);
        src_va = uvm_mem_gpu_address_virtual_kernel(src, gpu);
        gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, src->size);
    }
    else {
        unsigned i;
        const NvU32 chunk_size = src->chunk_size;

        UVM_ASSERT((src->size % chunk_size) == 0);

        gpu = src->backing_gpu;
        status = uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                           dst->backing_gpu,
                                           &push,
                                           uvm_channel_type_to_string(type));

        for (i = 0; i < src->size / chunk_size; i++) {
            dst_va = uvm_mem_gpu_address_copy(dst, gpu, i * chunk_size, chunk_size);
            src_va = uvm_mem_gpu_address_copy(src, gpu, i * chunk_size, chunk_size);
            gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, chunk_size);
        }
    }

    return uvm_push_end_and_wait(&push);
}

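// Return true if gpu_a and gpu_b are distinct GPUs that can directly copy to
// and from each other's memory.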
static bool can_do_peer_copies(uvm_va_space_t *va_space, uvm_gpu_t *gpu_a, uvm_gpu_t *gpu_b)
{
    if (gpu_a == gpu_b || !uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_a->id)], gpu_b->id))
        return false;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_b->id)], gpu_a->id));

    // TODO: Bug 2028875. Indirect peers are not supported for now.
    if (uvm_gpus_are_indirect_peers(gpu_a, gpu_b))
        return false;

    return true;
}

// Test the GPU to GPU push interface by transferring data between each
// permutation of GPU peers.
static NV_STATUS test_push_gpu_to_gpu(uvm_va_space_t *va_space)
{
    NvU32 i;
    NV_STATUS status;
    uvm_gpu_t *gpu, *gpu_a, *gpu_b;
    uvm_mem_t **mem;
    NvU32 *host_ptr;
    const size_t size = 1024 * 1024;
    bool waive = true;

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that GPU can access system memory without using encryption.
    if (g_uvm_global.conf_computing_enabled)
        return NV_OK;

    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (can_do_peer_copies(va_space, gpu_a, gpu_b)) {
                waive = false;
                break;
            }
        }
    }

    if (waive)
        return NV_OK;

    mem = uvm_kvmalloc_zero(sizeof(*mem) * UVM_ID_MAX_PROCESSORS);
    if (!mem)
        return NV_ERR_NO_MEMORY;

    // Alloc and initialize host buffer
    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &mem[UVM_ID_CPU_VALUE]);
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_ptr = (NvU32 *)uvm_mem_get_cpu_addr_kernel(mem[UVM_ID_CPU_VALUE]);

    for (i = 0; i < size / sizeof(NvU32); ++i)
        host_ptr[i] = i + 1;

    // Allocate vidmem on each GPU, and map the host buffer
    for_each_va_space_gpu(gpu, va_space) {
        status = uvm_mem_alloc_vidmem(size, gpu, &mem[uvm_id_value(gpu->id)]);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[uvm_id_value(gpu->id)], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[UVM_ID_CPU_VALUE], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

    // Copy buffer between each pair of GPU peers, in both directions
    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (!can_do_peer_copies(va_space, gpu_a, gpu_b))
                continue;

            // Copy from CPU to the first GPU, and then zero out the host copy
            status = sync_memcopy(UVM_CHANNEL_TYPE_CPU_TO_GPU,
                                  mem[uvm_id_value(gpu_a->id)],
                                  mem[UVM_ID_CPU_VALUE]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            memset(host_ptr, 0, size / sizeof(NvU32));

            // Copy from the first GPU to the second GPU
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_GPU,
                                  mem[uvm_id_value(gpu_b->id)],
                                  mem[uvm_id_value(gpu_a->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Copy from the second GPU back to the host, and check result
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_CPU,
                                  mem[UVM_ID_CPU_VALUE],
                                  mem[uvm_id_value(gpu_b->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            for (i = 0; i < size / sizeof(NvU32); ++i) {
                if (host_ptr[i] != i + 1) {
                    UVM_TEST_PRINT("host_ptr[%u] = %u instead of %u when copying between %s and %s\n",
                                   i,
                                   host_ptr[i],
                                   i + 1,
                                   uvm_gpu_name(gpu_a),
                                   uvm_gpu_name(gpu_b));
                    status = NV_ERR_INVALID_STATE;
                    TEST_CHECK_GOTO(status == NV_OK, done);
                }
            }
        }
    }

done:
    for_each_va_space_gpu(gpu, va_space)
        uvm_mem_free(mem[uvm_id_value(gpu->id)]);

    uvm_mem_free(mem[UVM_ID_CPU_VALUE]);
    uvm_kvfree(mem);

    return status;
}

NV_STATUS uvm_test_push_sanity(UVM_TEST_PUSH_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    // Take the global lock as some of the tests rely on being the
    // only thread doing pushes and could deadlock otherwise.
    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    status = test_push_end_size(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_inline_data(va_space);
    if (status != NV_OK)
        goto done;

    status = test_concurrent_pushes(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_interleaving(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_gpu_to_gpu(va_space);
    if (status != NV_OK)
        goto done;

    status = test_pushbuffer(va_space);
    if (status != NV_OK)
        goto done;

    if (!params->skipTimestampTest) {
        status = test_timestamp(va_space);
        if (status != NV_OK)
            goto done;
    }

done:
    uvm_va_space_up_read_rm(va_space);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return status;
}