/*******************************************************************************
    Copyright (c) 2015-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include <asm/atomic.h>

#include "uvm_global.h"
#include "uvm_channel.h"
#include "uvm_hal.h"
#include "uvm_mem.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_test_rng.h"
#include "uvm_thread_context.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_gpu_semaphore.h"
#include "uvm_kvmalloc.h"

#define TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES 2

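// Expected number of bytes appended to a push by uvm_push_end(), which depends
// on the channel type.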
static NvU32 get_push_end_size(uvm_channel_t *channel)
{
    if (uvm_channel_is_ce(channel))
        return UVM_PUSH_CE_END_SIZE;

    return 0;
}

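// Check that uvm_push_end() adds exactly the expected amount of methods for
// every channel type on every GPU in the VA space.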
static NV_STATUS test_push_end_size(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_type_t type;

        for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; ++type) {
            uvm_push_t push;
            NvU32 push_size_before;
            NvU32 push_end_size_observed, push_end_size_expected;

            TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
                                             type,
                                             &push,
                                             "type %s\n",
                                             uvm_channel_type_to_string(type)));

            push_size_before = uvm_push_get_size(&push);
            uvm_push_end(&push);

            push_end_size_expected = get_push_end_size(push.channel);
            push_end_size_observed = uvm_push_get_size(&push) - push_size_before;

            if (push_end_size_observed != push_end_size_expected) {
                UVM_TEST_PRINT("push_end_size incorrect, %u instead of %u on channel type %s for GPU %s\n",
                               push_end_size_observed,
                               push_end_size_expected,
                               uvm_channel_type_to_string(type),
                               uvm_gpu_name(gpu));

                // The size mismatch error gets precedence over a wait error
                (void) uvm_push_wait(&push);

                return NV_ERR_INVALID_STATE;
            }

            TEST_NV_CHECK_RET(uvm_push_wait(&push));
        }
    }

    return NV_OK;
}

typedef enum {
    TEST_INLINE_ADD,
    TEST_INLINE_GET,
    TEST_INLINE_SINGLE_BUFFER,
    TEST_INLINE_MAX,
} test_inline_type_t;

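// For each test size, write inline data into a push using one of the three
// inline data methods (incremental add, get, single buffer), copy it to a
// sysmem buffer with the CE, and verify the contents on the CPU.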
static NV_STATUS test_push_inline_data_gpu(uvm_gpu_t *gpu)
{
    static const size_t test_sizes[] = { 1, 2, 3, 4, 8, 31, 32, 1023, 1024, 1025, UVM_PUSH_INLINE_DATA_MAX_SIZE };
    NV_STATUS status;
    int i, j;
    int test_inline_type;
    uvm_push_t push;
    uvm_mem_t *mem = NULL;
    char *verif;

    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_PUSH_INLINE_DATA_MAX_SIZE, current->mm, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);

    status = uvm_mem_map_gpu_kernel(mem, gpu);
    TEST_CHECK_GOTO(status == NV_OK, done);

    verif = (char *)uvm_mem_get_cpu_addr_kernel(mem);

    for (test_inline_type = 0; test_inline_type < TEST_INLINE_MAX; ++test_inline_type) {
        for (i = 0; i < ARRAY_SIZE(test_sizes); ++i) {
            size_t test_size = test_sizes[i];
            uvm_push_inline_data_t data;
            size_t inline_data_size = 0;
            uvm_gpu_address_t data_gpu_address;
            char *inline_buf;

            status = uvm_push_begin(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_GPU_INTERNAL,
                                    &push,
                                    "Inline data size %zu",
                                    test_size);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Do a noop first to test inline data starting at different offsets
            gpu->parent->host_hal->noop(&push, roundup(min(test_size, (size_t)4096), UVM_METHOD_SIZE));

            switch (test_inline_type) {
                case TEST_INLINE_ADD:
                    uvm_push_inline_data_begin(&push, &data);
                    for (j = 0; j < test_size; ++j) {
                        char value = 1 + i + j;
                        uvm_push_inline_data_add(&data, &value, 1);
                    }
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    break;
                case TEST_INLINE_GET:
                    uvm_push_inline_data_begin(&push, &data);
                    inline_buf = (char*)uvm_push_inline_data_get(&data, test_size);
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
                case TEST_INLINE_SINGLE_BUFFER:
                    inline_buf = (char*)uvm_push_get_single_inline_buffer(&push, test_size, &data_gpu_address);
                    inline_data_size = test_size;
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
            }

            gpu->parent->ce_hal->memcopy(&push,
                                         uvm_mem_gpu_address_virtual_kernel(mem, gpu),
                                         data_gpu_address,
                                         test_size);
            status = uvm_push_end_and_wait(&push);
            TEST_CHECK_GOTO(status == NV_OK, done);

            TEST_CHECK_GOTO(inline_data_size == test_size, done);

            for (j = 0; j < test_size; ++j) {
                char expected = 1 + i + j;
                if (verif[j] != expected) {
                    UVM_TEST_PRINT("size %zu verif[%d] = %d instead of %d\n", test_size, j, verif[j], expected);
                    status = NV_ERR_INVALID_STATE;
                    goto done;
                }
            }
        }
    }
done:
    uvm_mem_free(mem);

    return status;
}

static NV_STATUS test_push_inline_data(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_CHECK_RET(test_push_inline_data_gpu(gpu) == NV_OK);
    }

    return NV_OK;
}

// Test that begins UVM_PUSH_MAX_CONCURRENT_PUSHES pushes on each GPU before
// ending any of them.
// Note that starting more than a single push is not safe to do outside of a
// test: if multiple threads tried doing so, it could easily deadlock.
static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    NvU32 i;
    uvm_push_t *pushes;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;

    // As noted above, this test does unsafe things that would be detected by
    // lock tracking, so opt out of the tracking.
    uvm_thread_context_lock_disable_tracking();

    pushes = uvm_kvmalloc_zero(sizeof(*pushes) * UVM_PUSH_MAX_CONCURRENT_PUSHES);
    if (pushes == NULL) {
        status = NV_ERR_NO_MEMORY;
        goto done;
    }

    for_each_va_space_gpu(gpu, va_space) {

        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            status = uvm_push_begin(gpu->channel_manager, channel_type, push, "concurrent push %u", i);
            TEST_CHECK_GOTO(status == NV_OK, done);
        }
        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            uvm_push_end(push);
            TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, push), done);
        }
        TEST_CHECK_GOTO(tracker.size != 0, done);

        status = uvm_tracker_wait(&tracker);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

done:
    uvm_thread_context_lock_enable_tracking();

    uvm_tracker_deinit(&tracker);

    uvm_kvfree(pushes);

    return status;
}

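// on_complete callbacks used by the interleaving test: they atomically add a
// fixed value to the counter pointed to by ptr.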
static void add_to_counter(void* ptr, int value)
{
    atomic_t *atomic = (atomic_t*) ptr;
    atomic_add(value, atomic);
}

static void add_one_to_counter(void* ptr)
{
    add_to_counter(ptr, 1);
}

static void add_two_to_counter(void* ptr)
{
    add_to_counter(ptr, 2);
}

static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
{
    NV_STATUS status;
    uvm_channel_t *channel;
    uvm_push_t push;
    NvU32 i;
    NvU32 *host_va;
    NvU64 gpu_va;
    NvU32 observed, expected;
    unsigned int num_non_paused_pushes;
    uvm_push_t pushes_not_ended[TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES];
    const NvLength size = sizeof(NvU32) * (1 + TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES);
    uvm_rm_mem_t *mem = NULL;
    atomic_t on_complete_counter = ATOMIC_INIT(0);

    // This test issues virtual memcopies/memsets, which in SR-IOV heavy cannot
    // be pushed to a proxy channel. Pushing to a UVM internal CE channel works
    // in all scenarios.
    channel = uvm_channel_any_of_type(gpu->channel_manager, UVM_CHANNEL_POOL_TYPE_CE);
    TEST_CHECK_RET(channel != NULL);

    if (channel->num_gpfifo_entries <= TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES) {
        UVM_TEST_PRINT("Insufficient number of gpfifo entries per channel to run this test. Expected at least %u "
                       "entries, but found %u\n",
                       TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES + 1,
                       channel->num_gpfifo_entries);
        return NV_ERR_INVALID_STATE;
    }
    num_non_paused_pushes = channel->num_gpfifo_entries;

    // The UVM driver only allows push interleaving across separate threads, but
    // it is hard to consistently replicate the interleaving. Instead, we
    // temporarily disable lock tracking, so we can interleave pushes from a
    // single thread.
    uvm_thread_context_lock_disable_tracking();

    status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);
    host_va = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
    gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(channel));
    memset(host_va, 0, size);

    // Begin a few pushes on the channel, but do not end them yet.
    // Each pushed method sets a magic number on an independent memory location.
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, pushes_not_ended + i, "Set to 0x%x", 0xDEADBEEF + i);
        TEST_CHECK_GOTO(status == NV_OK, done);
        gpu->parent->ce_hal->memset_v_4(pushes_not_ended + i,
                                        gpu_va + sizeof(NvU32) * (i + 1),
                                        0xDEADBEEF + i,
                                        sizeof(NvU32));

        push_info = uvm_push_info_from_push(pushes_not_ended + i);
        push_info->on_complete = add_two_to_counter;
        push_info->on_complete_data = &on_complete_counter;
    }

    // Push N (N = #channel entries) value increments to the same channel.
    for (i = 0; i < num_non_paused_pushes; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, &push, "inc to %u", i + 1);
        TEST_CHECK_GOTO(status == NV_OK, done);
        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, num_non_paused_pushes);

        push_info = uvm_push_info_from_push(&push);
        push_info->on_complete = add_one_to_counter;
        push_info->on_complete_data = &on_complete_counter;

        uvm_push_end(&push);
    }

    // End the pending pushes
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i)
        uvm_push_end(pushes_not_ended + i);

    // When the channel manager becomes idle, the GPU methods have completed,
    // and the CPU completion callbacks associated with the pushes have been
    // invoked.
    status = uvm_channel_manager_wait(channel->pool->manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    observed = host_va[0];
    expected = num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Observed counter %u but expected %u\n", observed, expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        observed = host_va[i + 1];
        expected = 0xDEADBEEF + i;
        if (observed != expected) {
            UVM_TEST_PRINT("Observed magic number 0x%x but expected 0x%x\n", observed, expected);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

    observed = atomic_read(&on_complete_counter);
    expected = TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES * 2 + num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Wrong value of counter incremented by push info callback. Observed %u but expected %u\n",
                       observed,
                       expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_rm_mem_free(mem);
    uvm_thread_context_lock_enable_tracking();

    return status;
}

// Using a single thread, interleave pushes and check that the result is
// consistent with a non-interleaved sequence.
// 1) Begin a few pushes in channel X but do not end them. Each pushed (GPU)
//    method sets an individual value in an independent system memory location.
//    Each push is associated with a push info (CPU) callback that atomically
//    adds 2 to a memory location M.
// 2) Begin and end many pushes in the same channel X such that all the gpfifo
//    entries are filled. All the pushed methods do the same thing: atomically
//    increment a given system memory location.
//    Each push is associated with a push info callback that atomically
//    increments the memory location M.
// 3) End the pending pushes
//
// The final state should be the same as in the non-interleaved sequence
// (1)-(3)-(2)
//
// Starting more than a single push is not safe to do outside of a test: if
// multiple threads tried doing so, it could easily deadlock.
static NV_STATUS test_push_interleaving(uvm_va_space_t *va_space)
{
    NV_STATUS status;
    uvm_gpu_t *gpu;

    BUILD_BUG_ON(TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES >= UVM_PUSH_MAX_CONCURRENT_PUSHES);

    for_each_va_space_gpu(gpu, va_space) {
        status = test_push_interleaving_on_gpu(gpu);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

// Push exactly UVM_MAX_PUSH_SIZE worth of methods while acquiring a semaphore.
// This is very tightly coupled with the pushbuffer implementation and method
// sizes, which is not ideal, but it allows testing corner cases in the
// pushbuffer management code.
static NV_STATUS test_push_exactly_max_push(uvm_gpu_t *gpu,
                                            uvm_push_t *push,
                                            uvm_channel_type_t channel_type,
                                            uvm_gpu_semaphore_t *sema_to_acquire,
                                            NvU32 value)
{
    NV_STATUS status;
    NvU64 semaphore_gpu_va;
    NvU32 push_end_size;

    status = uvm_push_begin(gpu->channel_manager, channel_type, push, "Test push");
    if (status != NV_OK)
        return status;

    TEST_CHECK_RET(uvm_push_has_space(push, UVM_MAX_PUSH_SIZE));
    TEST_CHECK_RET(!uvm_push_has_space(push, UVM_MAX_PUSH_SIZE + 1));

    semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(sema_to_acquire, gpu, uvm_channel_is_proxy(push->channel));
    gpu->parent->host_hal->semaphore_acquire(push, semaphore_gpu_va, value);

    // Push a noop leaving just push_end_size in the pushbuffer.
    push_end_size = get_push_end_size(push->channel);
    gpu->parent->host_hal->noop(push, UVM_MAX_PUSH_SIZE - uvm_push_get_size(push) - push_end_size);

    TEST_CHECK_RET(uvm_push_has_space(push, push_end_size));
    TEST_CHECK_RET(!uvm_push_has_space(push, push_end_size + 1));
    uvm_push_end(push);

    UVM_ASSERT_MSG(uvm_push_get_size(push) == UVM_MAX_PUSH_SIZE, "push_size %u\n", uvm_push_get_size(push));

    return NV_OK;
}

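// Count the pushbuffer chunks currently marked as idle.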
static NvU32 test_count_idle_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 i;
    NvU32 count = 0;
    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
        count += test_bit(i, pushbuffer->idle_chunks) ? 1 : 0;
    return count;
}

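// Count the pushbuffer chunks currently marked as available.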
static NvU32 test_count_available_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 i;
    NvU32 count = 0;
    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i)
        count += test_bit(i, pushbuffer->available_chunks) ? 1 : 0;
    return count;
}

// Reuse the whole pushbuffer 4 times, one UVM_MAX_PUSH_SIZE at a time
#define EXTRA_MAX_PUSHES_WHILE_FULL (4 * UVM_PUSHBUFFER_SIZE / UVM_MAX_PUSH_SIZE)

// Test doing pushes of exactly UVM_MAX_PUSH_SIZE bytes, only allowing them to
// complete one by one.
static NV_STATUS test_max_pushes_on_gpu_and_channel_type(uvm_gpu_t *gpu, uvm_channel_type_t channel_type)
{
    NV_STATUS status;

    uvm_tracker_t tracker;
    uvm_gpu_semaphore_t sema;
    NvU32 total_push_size = 0;
    NvU32 push_count = 0;
    NvU32 i;

    uvm_tracker_init(&tracker);

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in a completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    while (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
        uvm_push_t push;

        ++push_count;

        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        total_push_size += uvm_push_get_size(&push);
        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

    if (total_push_size != UVM_PUSHBUFFER_SIZE) {
        UVM_TEST_PRINT("Unexpected space in the pushbuffer, total push size %u\n", total_push_size);
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    TEST_CHECK_GOTO(test_count_available_chunks(gpu->channel_manager->pushbuffer) == 0, done);
    TEST_CHECK_GOTO(test_count_idle_chunks(gpu->channel_manager->pushbuffer) == 0, done);

    for (i = 0; i < EXTRA_MAX_PUSHES_WHILE_FULL; ++i) {
        uvm_push_t push;

        // There should be no space for another push until the sema is
        // incremented. Incrementing the sema allows a single push to complete,
        // freeing exactly UVM_MAX_PUSH_SIZE of space.
        if (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
            UVM_TEST_PRINT("Unexpected space in the pushbuffer for iter %d\n", i);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }

        uvm_gpu_semaphore_set_payload(&sema, i + 1);

        ++push_count;

        // Take UVM_MAX_PUSH_SIZE space. This should leave no space left again.
        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

done:
    uvm_gpu_semaphore_set_payload(&sema, push_count);
    uvm_tracker_wait_deinit(&tracker);

    uvm_gpu_semaphore_free(&sema);

    return status;
}

static NV_STATUS test_max_pushes_on_gpu(uvm_gpu_t *gpu)
{

    TEST_NV_CHECK_RET(test_max_pushes_on_gpu_and_channel_type(gpu, UVM_CHANNEL_TYPE_GPU_INTERNAL));

    return NV_OK;
}

// Test doing UVM_PUSHBUFFER_CHUNKS independent pushes, expecting each one to
// use a different chunk in the pushbuffer.
static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    uvm_gpu_semaphore_t sema;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NvU32 i;

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in a completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i) {
        NvU64 semaphore_gpu_va;
        uvm_push_t push;

        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Push using chunk %u", i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(&sema, gpu, uvm_channel_is_proxy(push.channel));
        gpu->parent->host_hal->semaphore_acquire(&push, semaphore_gpu_va, i + 1);
        uvm_push_end(&push);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);

        if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS - i - 1) {
            UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u instead of %u\n",
                           test_count_idle_chunks(gpu->channel_manager->pushbuffer),
                           UVM_PUSHBUFFER_CHUNKS - i - 1);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }
    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);

    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS) {
        UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u\n",
                       test_count_idle_chunks(gpu->channel_manager->pushbuffer));
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);
    uvm_tracker_wait(&tracker);

    uvm_gpu_semaphore_free(&sema);
    uvm_tracker_deinit(&tracker);

    return status;
}

static NV_STATUS test_pushbuffer(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_NV_CHECK_RET(test_max_pushes_on_gpu(gpu));
        TEST_NV_CHECK_RET(test_idle_chunks_on_gpu(gpu));
    }
    return NV_OK;
}

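// State shared between the timestamp test and its on_complete callback.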
typedef struct
{
    NvU64 *timestamp_in_pushbuffer;
    NvU64 timestamp;
} timestamp_test_t;

static void timestamp_on_complete(void *void_data)
{
    timestamp_test_t *data = (timestamp_test_t *)void_data;

    if (uvm_global_get_status() != NV_OK) {
        // Do nothing if a global error has been set, as the callback might be
        // called from teardown, where the reference to the test data is no
        // longer valid.
        return;
    }

    data->timestamp = *data->timestamp_in_pushbuffer;
}

static NV_STATUS test_timestamp_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_push_t push;
    timestamp_test_t test_data = {0};
    NvU32 i;
    NvU64 last_stamp = 0;

    for (i = 0; i < 10; ++i) {
        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Releasing a timestamp");
        if (status != NV_OK)
            return status;

        test_data.timestamp_in_pushbuffer = uvm_push_timestamp(&push);
        uvm_push_info_from_push(&push)->on_complete = timestamp_on_complete;
        uvm_push_info_from_push(&push)->on_complete_data = &test_data;
        uvm_push_end(&push);

        // Synchronize the channel manager to make sure the on_complete
        // callbacks have a chance to run.
        status = uvm_channel_manager_wait(gpu->channel_manager);
        TEST_CHECK_RET(status == NV_OK);

        TEST_CHECK_RET(test_data.timestamp != 0);
        TEST_CHECK_RET(test_data.timestamp > last_stamp);
        last_stamp = test_data.timestamp;
    }

    return NV_OK;
}

static NV_STATUS test_timestamp(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space)
        TEST_CHECK_RET(test_timestamp_on_gpu(gpu) == NV_OK);

    return NV_OK;
}

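// Synchronously copy the contents of src into dst using the given channel
// type. At least one of the buffers must be vidmem. GPU-to-GPU copies are
// performed one chunk at a time using copy addresses.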
static NV_STATUS sync_memcopy(uvm_channel_type_t type, uvm_mem_t *dst, uvm_mem_t *src)
{
    uvm_push_t push;
    uvm_gpu_address_t dst_va;
    uvm_gpu_address_t src_va;
    uvm_gpu_t *gpu;
    NV_STATUS status;

    UVM_ASSERT(uvm_mem_is_vidmem(src) || uvm_mem_is_vidmem(dst));

    if (type == UVM_CHANNEL_TYPE_CPU_TO_GPU || type == UVM_CHANNEL_TYPE_GPU_TO_CPU) {
        gpu = (type == UVM_CHANNEL_TYPE_CPU_TO_GPU) ? dst->backing_gpu : src->backing_gpu;
        status = uvm_push_begin(gpu->channel_manager, type, &push, uvm_channel_type_to_string(type));
        if (status != NV_OK)
            return status;

        dst_va = uvm_mem_gpu_address_virtual_kernel(dst, gpu);
        src_va = uvm_mem_gpu_address_virtual_kernel(src, gpu);
        gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, src->size);
    }
    else {
        unsigned i;
        const NvU32 chunk_size = src->chunk_size;

        UVM_ASSERT((src->size % chunk_size) == 0);

        gpu = src->backing_gpu;
        status = uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                           dst->backing_gpu,
                                           &push,
                                           uvm_channel_type_to_string(type));
        if (status != NV_OK)
            return status;

        for (i = 0; i < src->size / chunk_size; i++) {
            dst_va = uvm_mem_gpu_address_copy(dst, gpu, i * chunk_size, chunk_size);
            src_va = uvm_mem_gpu_address_copy(src, gpu, i * chunk_size, chunk_size);
            gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, chunk_size);
        }
    }

    return uvm_push_end_and_wait(&push);
}

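// Return true if peer copies between gpu_a and gpu_b can be tested: the GPUs
// must be distinct, direct peers with copy access to each other.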
static bool can_do_peer_copies(uvm_va_space_t *va_space, uvm_gpu_t *gpu_a, uvm_gpu_t *gpu_b)
{
    if (gpu_a == gpu_b || !uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_a->id)], gpu_b->id))
        return false;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_b->id)], gpu_a->id));

    // TODO: Bug 2028875. Indirect peers are not supported for now.
    if (uvm_gpus_are_indirect_peers(gpu_a, gpu_b))
        return false;

    return true;
}

// Test the GPU to GPU push interface by transferring data between each
// permutation of GPU peers.
static NV_STATUS test_push_gpu_to_gpu(uvm_va_space_t *va_space)
{
    NvU32 i;
    NV_STATUS status;
    uvm_gpu_t *gpu, *gpu_a, *gpu_b;
    uvm_mem_t *mem[UVM_ID_MAX_PROCESSORS] = {NULL};
    NvU32 *host_ptr;
    const size_t size = 1024 * 1024;
    bool waive = true;

    for_each_va_space_gpu(gpu_a, va_space) {

        for_each_va_space_gpu(gpu_b, va_space) {
            if (can_do_peer_copies(va_space, gpu_a, gpu_b)) {
                waive = false;
                break;
            }
        }
    }

    if (waive)
        return NV_OK;

    // Allocate and initialize the host buffer
    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &mem[UVM_ID_CPU_VALUE]);
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_ptr = (NvU32 *)uvm_mem_get_cpu_addr_kernel(mem[UVM_ID_CPU_VALUE]);

    for (i = 0; i < size / sizeof(NvU32); ++i)
        host_ptr[i] = i + 1;

    // Allocate vidmem on each GPU, and map the host buffer
    for_each_va_space_gpu(gpu, va_space) {
        status = uvm_mem_alloc_vidmem(size, gpu, &mem[uvm_id_value(gpu->id)]);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[uvm_id_value(gpu->id)], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[UVM_ID_CPU_VALUE], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

    // Copy the buffer between each pair of GPU peers, in both directions
    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (!can_do_peer_copies(va_space, gpu_a, gpu_b))
                continue;

            // Copy from the CPU to the first GPU, and then zero out the host copy
            status = sync_memcopy(UVM_CHANNEL_TYPE_CPU_TO_GPU,
                                  mem[uvm_id_value(gpu_a->id)],
                                  mem[UVM_ID_CPU_VALUE]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            memset(host_ptr, 0, size);

            // Copy from the first GPU to the second GPU
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_GPU,
                                  mem[uvm_id_value(gpu_b->id)],
                                  mem[uvm_id_value(gpu_a->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Copy from the second GPU back to the host, and check the result
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_CPU,
                                  mem[UVM_ID_CPU_VALUE],
                                  mem[uvm_id_value(gpu_b->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            for (i = 0; i < size / sizeof(NvU32); ++i) {
                if (host_ptr[i] != i + 1) {
                    UVM_TEST_PRINT("host_ptr[%u] = %u instead of %u when copying between %s and %s\n",
                                   i,
                                   host_ptr[i],
                                   i + 1,
                                   uvm_gpu_name(gpu_a),
                                   uvm_gpu_name(gpu_b));
                    status = NV_ERR_INVALID_STATE;
                    goto done;
                }
            }
        }
    }

done:
    for_each_va_space_gpu(gpu, va_space)
        uvm_mem_free(mem[uvm_id_value(gpu->id)]);

    uvm_mem_free(mem[UVM_ID_CPU_VALUE]);

    return status;
}

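// Entry point of the push sanity test: holds the global lock across all
// subtests, since some of them rely on being the only thread doing pushes.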
NV_STATUS uvm_test_push_sanity(UVM_TEST_PUSH_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    // Take the global lock as some of the tests rely on being the
    // only thread doing pushes and could deadlock otherwise.
    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    status = test_push_end_size(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_inline_data(va_space);
    if (status != NV_OK)
        goto done;

    status = test_concurrent_pushes(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_interleaving(va_space);
    if (status != NV_OK)
        goto done;

    status = test_push_gpu_to_gpu(va_space);
    if (status != NV_OK)
        goto done;

    status = test_pushbuffer(va_space);
    if (status != NV_OK)
        goto done;

    if (!params->skipTimestampTest) {
        status = test_timestamp(va_space);
        if (status != NV_OK)
            goto done;
    }

done:
    uvm_va_space_up_read_rm(va_space);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return status;
}