/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_channel.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_kvmalloc.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_tracker.h"
#include "uvm_va_space.h"
#include "uvm_rm_mem.h"
#include "uvm_mem.h"
#include "uvm_gpu.h"

#define CE_TEST_MEM_SIZE (2 * 1024 * 1024)
#define CE_TEST_MEM_END_SIZE 32
#define CE_TEST_MEM_BEGIN_SIZE 32
#define CE_TEST_MEM_MIDDLE_SIZE (CE_TEST_MEM_SIZE - CE_TEST_MEM_BEGIN_SIZE - CE_TEST_MEM_END_SIZE)
#define CE_TEST_MEM_MIDDLE_OFFSET (CE_TEST_MEM_BEGIN_SIZE)
#define CE_TEST_MEM_END_OFFSET (CE_TEST_MEM_SIZE - CE_TEST_MEM_END_SIZE)
#define CE_TEST_MEM_COUNT 5

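// Test that CE transfers issued from a single (L)CE can be ordered with
// UVM_PUSH_FLAG_CE_NEXT_PIPELINED and no membars: a chain of memsets and
// memcopies propagates a known value through a series of buffers, and the
// final sysmem copy is verified from the CPU.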
static NV_STATUS test_non_pipelined(uvm_gpu_t *gpu)
{
    NvU32 i;
    NV_STATUS status;
    uvm_rm_mem_t *mem[CE_TEST_MEM_COUNT] = { NULL };
    uvm_rm_mem_t *host_mem = NULL;
    NvU32 *host_ptr;
    NvU64 host_mem_gpu_va, mem_gpu_va;
    NvU64 dst_va;
    NvU64 src_va;
    uvm_push_t push;
    bool is_proxy;

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, CE_TEST_MEM_SIZE, 0, &host_mem);
    TEST_CHECK_GOTO(status == NV_OK, done);
    host_ptr = (NvU32 *)uvm_rm_mem_get_cpu_va(host_mem);
    memset(host_ptr, 0, CE_TEST_MEM_SIZE);

    for (i = 0; i < CE_TEST_MEM_COUNT; ++i) {
        status = uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, CE_TEST_MEM_SIZE, 0, &mem[i]);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Non-pipelined test");
    TEST_CHECK_GOTO(status == NV_OK, done);

    is_proxy = uvm_channel_is_proxy(push.channel);
    host_mem_gpu_va = uvm_rm_mem_get_gpu_va(host_mem, gpu, is_proxy).address;

    // All of the following CE transfers are done from a single (L)CE and
    // disabling pipelining is enough to order them when needed. Only push_end
    // needs a MEMBAR SYS to order everything with the CPU.

    // Initialize to a bad value
    for (i = 0; i < CE_TEST_MEM_COUNT; ++i) {
        mem_gpu_va = uvm_rm_mem_get_gpu_va(mem[i], gpu, is_proxy).address;

        uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        gpu->parent->ce_hal->memset_v_4(&push, mem_gpu_va, 1337 + i, CE_TEST_MEM_SIZE);
    }

    // Set the first buffer to 1
    uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    mem_gpu_va = uvm_rm_mem_get_gpu_va(mem[0], gpu, is_proxy).address;
    gpu->parent->ce_hal->memset_v_4(&push, mem_gpu_va, 1, CE_TEST_MEM_SIZE);

    for (i = 0; i < CE_TEST_MEM_COUNT; ++i) {
        NvU32 dst = i + 1;
        if (dst == CE_TEST_MEM_COUNT)
            dst_va = host_mem_gpu_va;
        else
            dst_va = uvm_rm_mem_get_gpu_va(mem[dst], gpu, is_proxy).address;

        src_va = uvm_rm_mem_get_gpu_va(mem[i], gpu, is_proxy).address;

        // The first memcopy in each iteration needs to be non-pipelined as
        // otherwise the previous memset/memcopy to the source may not be done
        // yet.

        // Alternate the order of copying the beginning and the end
        if (i % 2 == 0) {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            gpu->parent->ce_hal->memcopy_v_to_v(&push,
                                                dst_va + CE_TEST_MEM_END_OFFSET,
                                                src_va + CE_TEST_MEM_END_OFFSET,
                                                CE_TEST_MEM_END_SIZE);

            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
            gpu->parent->ce_hal->memcopy_v_to_v(&push,
                                                dst_va + CE_TEST_MEM_MIDDLE_OFFSET,
                                                src_va + CE_TEST_MEM_MIDDLE_OFFSET,
                                                CE_TEST_MEM_MIDDLE_SIZE);

            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
            gpu->parent->ce_hal->memcopy_v_to_v(&push, dst_va, src_va, CE_TEST_MEM_BEGIN_SIZE);
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            gpu->parent->ce_hal->memcopy_v_to_v(&push, dst_va, src_va, CE_TEST_MEM_BEGIN_SIZE);

            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
            gpu->parent->ce_hal->memcopy_v_to_v(&push,
                                                dst_va + CE_TEST_MEM_MIDDLE_OFFSET,
                                                src_va + CE_TEST_MEM_MIDDLE_OFFSET,
                                                CE_TEST_MEM_MIDDLE_SIZE);

            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
            gpu->parent->ce_hal->memcopy_v_to_v(&push,
                                                dst_va + CE_TEST_MEM_END_OFFSET,
                                                src_va + CE_TEST_MEM_END_OFFSET,
                                                CE_TEST_MEM_END_SIZE);
        }
    }

    status = uvm_push_end_and_wait(&push);
    TEST_CHECK_GOTO(status == NV_OK, done);

    for (i = 0; i < CE_TEST_MEM_SIZE / sizeof(NvU32); ++i) {
        if (host_ptr[i] != 1) {
            UVM_TEST_PRINT("host_ptr[%u] = %u instead of 1\n", i, host_ptr[i]);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

done:
    for (i = 0; i < CE_TEST_MEM_COUNT; ++i) {
        uvm_rm_mem_free(mem[i]);
    }
    uvm_rm_mem_free(host_mem);

    return status;
}

#define REDUCTIONS 32

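// Test that the sys membar implied by ending a push is enough to make a batch
// of semaphore reduction increments (each submitted with no membar) visible
// to the CPU.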
static NV_STATUS test_membar(uvm_gpu_t *gpu)
{
    NvU32 i;
    NV_STATUS status;
    uvm_rm_mem_t *host_mem = NULL;
    NvU32 *host_ptr;
    NvU64 host_mem_gpu_va;
    uvm_push_t push;
    NvU32 value;

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(NvU32), 0, &host_mem);
    TEST_CHECK_GOTO(status == NV_OK, done);
    host_ptr = (NvU32 *)uvm_rm_mem_get_cpu_va(host_mem);
    *host_ptr = 0;

    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Membar test");
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_mem_gpu_va = uvm_rm_mem_get_gpu_va(host_mem, gpu, uvm_channel_is_proxy(push.channel)).address;

    for (i = 0; i < REDUCTIONS; ++i) {
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS + 1);
    }

    // Without a sys membar the channel tracking semaphore can and does
    // complete before all the reductions.
    status = uvm_push_end_and_wait(&push);
    TEST_CHECK_GOTO(status == NV_OK, done);

    value = *host_ptr;
    if (value != REDUCTIONS) {
        UVM_TEST_PRINT("Value = %u instead of %u, GPU %s\n", value, REDUCTIONS, uvm_gpu_name(gpu));
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_rm_mem_free(host_mem);

    return status;
}

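// Dispatch to the CE memset HAL variant matching the element size (1, 4 or 8
// bytes).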
static void push_memset(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t element_size, size_t size)
{
    switch (element_size) {
        case 1:
            uvm_push_get_gpu(push)->parent->ce_hal->memset_1(push, dst, (NvU8)value, size);
            break;
        case 4:
            uvm_push_get_gpu(push)->parent->ce_hal->memset_4(push, dst, (NvU32)value, size);
            break;
        case 8:
            uvm_push_get_gpu(push)->parent->ce_hal->memset_8(push, dst, value, size);
            break;
        default:
            UVM_ASSERT(0);
    }
}

static NV_STATUS test_unaligned_memset(uvm_gpu_t *gpu,
                                       uvm_gpu_address_t gpu_verif_addr,
                                       NvU8 *cpu_verif_addr,
                                       size_t size,
                                       size_t element_size,
                                       size_t offset)
{
    uvm_push_t push;
    NV_STATUS status;
    size_t i;
    NvU64 value64 = (offset + 2) * (1ull << 32) + (offset + 1);
    NvU64 test_value, expected_value = 0;
    uvm_gpu_address_t dst;

    // Memset a single element at an unaligned position and make sure it
    // doesn't clobber anything else
    TEST_CHECK_RET(gpu_verif_addr.address % element_size == 0);
    TEST_CHECK_RET(offset + element_size <= size);
    dst = gpu_verif_addr;
    dst.address += offset;

    memset(cpu_verif_addr, (NvU8)(~value64), size);

    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push,
                            "memset_%zu offset %zu",
                            element_size, offset);
    TEST_CHECK_RET(status == NV_OK);

    push_memset(&push, dst, value64, element_size, element_size);
    status = uvm_push_end_and_wait(&push);
    TEST_CHECK_RET(status == NV_OK);

    // Make sure all bytes of the element are present
    test_value = 0;
    memcpy(&test_value, cpu_verif_addr + offset, element_size);

    switch (element_size) {
        case 1:
            expected_value = (NvU8)value64;
            break;
        case 4:
            expected_value = (NvU32)value64;
            break;
        case 8:
            expected_value = value64;
            break;
        default:
            UVM_ASSERT(0);
    }

    if (test_value != expected_value) {
        UVM_TEST_PRINT("memset_%zu offset %zu failed, written value is 0x%llx instead of 0x%llx\n",
                       element_size, offset, test_value, expected_value);
        return NV_ERR_INVALID_STATE;
    }

    // Make sure all other bytes are unchanged
    for (i = 0; i < size; i++) {
        if (i >= offset && i < offset + element_size)
            continue;
        if (cpu_verif_addr[i] != (NvU8)(~value64)) {
            UVM_TEST_PRINT("memset_%zu offset %zu failed, immutable byte %zu changed value from 0x%x to 0x%x\n",
                           element_size, offset, i, (NvU8)(~value64),
                           cpu_verif_addr[i]);
            return NV_ERR_INVALID_STATE;
        }
    }

    return NV_OK;
}

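// Single memset + memcopy check for one (src, dst) GPU address combination.
// The result is copied into gpu_verif_addr and verified through its CPU
// mapping.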
static NV_STATUS test_memcpy_and_memset_inner(uvm_gpu_t *gpu,
                                              uvm_gpu_address_t dst,
                                              uvm_gpu_address_t src,
                                              size_t size,
                                              size_t element_size,
                                              uvm_gpu_address_t gpu_verif_addr,
                                              void *cpu_verif_addr,
                                              int test_iteration)
{
    uvm_push_t push;
    size_t i;
    const char *src_type = src.is_virtual ? "virtual" : "physical";
    const char *src_loc = src.aperture == UVM_APERTURE_SYS ? "sysmem" : "vidmem";
    const char *dst_type = dst.is_virtual ? "virtual" : "physical";
    const char *dst_loc = dst.aperture == UVM_APERTURE_SYS ? "sysmem" : "vidmem";

    NvU64 value64 = (test_iteration + 2) * (1ull << 32) + (test_iteration + 1);
    NvU64 test_value = 0, expected_value = 0;

    TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
                                     UVM_CHANNEL_TYPE_GPU_INTERNAL,
                                     &push,
                                     "Memset %s %s (0x%llx) and memcopy to %s %s (0x%llx), iter %d",
                                     src_type,
                                     src_loc,
                                     src.address,
                                     dst_type,
                                     dst_loc,
                                     dst.address,
                                     test_iteration));

    // Waive if any of the input addresses is physical but the channel does not
    // support physical addressing
    if (!uvm_channel_is_privileged(push.channel) && (!dst.is_virtual || !src.is_virtual)) {
        TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
        return NV_OK;
    }

    // The input virtual addresses exist in UVM's internal address space, not
    // the proxy address space
    if (uvm_channel_is_proxy(push.channel)) {
        TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
        return NV_ERR_INVALID_STATE;
    }

    // If physical accesses aren't supported, silently convert to virtual to
    // test the flat mapping.
    TEST_CHECK_RET(gpu_verif_addr.is_virtual);

    if (!src.is_virtual)
        src = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(src.aperture, src.address));

    if (!dst.is_virtual)
        dst = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(dst.aperture, dst.address));

    // Memset src with the appropriate element size, then memcpy to dst and
    // from dst to the sysmem verif location.

    push_memset(&push, src, value64, element_size, size);
    gpu->parent->ce_hal->memcopy(&push, dst, src, size);
    gpu->parent->ce_hal->memcopy(&push, gpu_verif_addr, dst, size);

    TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));

    for (i = 0; i < size / element_size; i++) {
        switch (element_size) {
            case 1:
                expected_value = (NvU8)value64;
                test_value = ((NvU8 *)cpu_verif_addr)[i];
                break;
            case 4:
                expected_value = (NvU32)value64;
                test_value = ((NvU32 *)cpu_verif_addr)[i];
                break;
            case 8:
                expected_value = value64;
                test_value = ((NvU64 *)cpu_verif_addr)[i];
                break;
            default:
                UVM_ASSERT(0);
        }

        if (test_value != expected_value) {
            UVM_TEST_PRINT("memset_%zu of %s %s and memcpy into %s %s failed, value[%zu] = 0x%llx instead of 0x%llx\n",
                           element_size, src_type, src_loc, dst_type, dst_loc,
                           i, test_value, expected_value);
            return NV_ERR_INVALID_STATE;
        }
    }

    return NV_OK;
}

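// Exercise CE memset and memcopy over combinations of virtual and physical
// GPU addresses backed by sysmem and vidmem, for several element sizes,
// including unaligned memsets.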
static NV_STATUS test_memcpy_and_memset(uvm_gpu_t *gpu)
{
    NV_STATUS status = NV_OK;
    bool is_proxy_va_space = false;
    uvm_gpu_address_t gpu_verif_addr;
    void *cpu_verif_addr;
    uvm_mem_t *verif_mem = NULL;
    uvm_mem_t *sys_uvm_mem = NULL;
    uvm_mem_t *gpu_uvm_mem = NULL;
    uvm_rm_mem_t *sys_rm_mem = NULL;
    uvm_rm_mem_t *gpu_rm_mem = NULL;
    uvm_gpu_address_t gpu_addresses[4] = {0};
    size_t size = gpu->big_page.internal_size;
    static const size_t element_sizes[] = {1, 4, 8};
    const size_t iterations = 4;
    size_t i, j, k, s;
    uvm_mem_alloc_params_t mem_params = {0};

    if (uvm_conf_computing_mode_enabled(gpu))
        TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(size, gpu, current->mm, &verif_mem), done);
    else
        TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &verif_mem), done);
    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, gpu), done);

    gpu_verif_addr = uvm_mem_gpu_address_virtual_kernel(verif_mem, gpu);
    cpu_verif_addr = uvm_mem_get_cpu_addr_kernel(verif_mem);

    for (i = 0; i < iterations; ++i) {
        for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
            TEST_NV_CHECK_GOTO(test_unaligned_memset(gpu,
                                                     gpu_verif_addr,
                                                     cpu_verif_addr,
                                                     size,
                                                     element_sizes[s],
                                                     i),
                               done);
        }
    }

    // Virtual address (in UVM's internal address space) backed by sysmem
    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_SYS, size, 0, &sys_rm_mem), done);
    gpu_addresses[0] = uvm_rm_mem_get_gpu_va(sys_rm_mem, gpu, is_proxy_va_space);

    if (uvm_conf_computing_mode_enabled(gpu)) {
        for (i = 0; i < iterations; ++i) {
            for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
                TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
                                                                gpu_addresses[0],
                                                                gpu_addresses[0],
                                                                size,
                                                                element_sizes[s],
                                                                gpu_verif_addr,
                                                                cpu_verif_addr,
                                                                i),
                                   done);
            }
        }

        // Because gpu_verif_addr is in sysmem, when the Confidential
        // Computing feature is enabled, only the previous cases are valid.
        // TODO: Bug 3839176: the test is partially waived on Confidential
        // Computing because it assumes that the GPU can access system memory
        // without using encryption.
        goto done;
    }

    // Using a page size equal to the allocation size ensures that the UVM
    // memories about to be allocated are physically contiguous. Since the
    // size is also a valid GPU page size, the memories can be virtually
    // mapped on the GPU if needed.
    mem_params.size = size;
    mem_params.page_size = size;
    mem_params.mm = current->mm;

    // Physical address in sysmem
    TEST_NV_CHECK_GOTO(uvm_mem_alloc(&mem_params, &sys_uvm_mem), done);
    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_phys(sys_uvm_mem, gpu), done);
    gpu_addresses[1] = uvm_mem_gpu_address_physical(sys_uvm_mem, gpu, 0, size);

    // Physical address in vidmem
    mem_params.backing_gpu = gpu;
    TEST_NV_CHECK_GOTO(uvm_mem_alloc(&mem_params, &gpu_uvm_mem), done);
    gpu_addresses[2] = uvm_mem_gpu_address_physical(gpu_uvm_mem, gpu, 0, size);

    // Virtual address (in UVM's internal address space) backed by vidmem
    TEST_NV_CHECK_GOTO(uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, size, 0, &gpu_rm_mem), done);
    gpu_addresses[3] = uvm_rm_mem_get_gpu_va(gpu_rm_mem, gpu, is_proxy_va_space);

    for (i = 0; i < iterations; ++i) {
        for (j = 0; j < ARRAY_SIZE(gpu_addresses); ++j) {
            for (k = 0; k < ARRAY_SIZE(gpu_addresses); ++k) {
                for (s = 0; s < ARRAY_SIZE(element_sizes); s++) {
                    TEST_NV_CHECK_GOTO(test_memcpy_and_memset_inner(gpu,
                                                                    gpu_addresses[k],
                                                                    gpu_addresses[j],
                                                                    size,
                                                                    element_sizes[s],
                                                                    gpu_verif_addr,
                                                                    cpu_verif_addr,
                                                                    i),
                                       done);
                }
            }
        }
    }

done:
    uvm_rm_mem_free(sys_rm_mem);
    uvm_rm_mem_free(gpu_rm_mem);
    uvm_mem_free(gpu_uvm_mem);
    uvm_mem_free(sys_uvm_mem);
    uvm_mem_free(verif_mem);

    return status;
}

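// Allocate a sysmem semaphore backed by uvm_mem, map it on the CPU and in
// UVM's internal GPU address space, and check that its GPU VA falls in the
// upper half of the GPU's effective virtual address space.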
static NV_STATUS test_semaphore_alloc_sem(uvm_gpu_t *gpu, size_t size, uvm_mem_t **mem_out)
{
    NvU64 gpu_va;
    NV_STATUS status = NV_OK;
    uvm_mem_t *mem = NULL;

    TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &mem));

    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(mem, gpu), error);

    gpu_va = uvm_mem_get_gpu_va_kernel(mem, gpu);

    // This semaphore resides in the uvm_mem region, so its GPU VA has the most
    // significant bit set. The intent is to validate semaphore operations when
    // the semaphore's VA lies at the high end of the GPU's effective virtual
    // address space.
    TEST_CHECK_GOTO(gpu_va & (1ULL << (gpu->address_space_tree.hal->num_va_bits() - 1)), error);

    *mem_out = mem;

    return NV_OK;

error:
    uvm_mem_free(mem);
    return status;
}

// test_semaphore_reduction_inc is similar in concept to test_membar(). It uses
// uvm_mem (instead of uvm_rm_mem) as the semaphore, i.e., it assumes that the
// CE HAL has been validated, since uvm_mem needs the CE memset/memcopy to be
// operational as a prerequisite for GPU PTE writes. The purpose of
// test_semaphore_reduction_inc is to validate the reduction inc operation on
// semaphores with their VA's upper bit set.
static NV_STATUS test_semaphore_reduction_inc(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_mem_t *mem;
    NvU64 gpu_va;
    NvU32 i;
    NvU32 *host_ptr = NULL;
    NvU32 value;

    // Semaphore reduction needs 1 word (4 bytes).
    const size_t size = sizeof(NvU32);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = test_semaphore_alloc_sem(gpu, size, &mem);
    TEST_CHECK_RET(status == NV_OK);

    // Initialize the counter of reductions.
    host_ptr = uvm_mem_get_cpu_addr_kernel(mem);
    TEST_CHECK_GOTO(host_ptr != NULL, done);
    *host_ptr = 0;

    gpu_va = uvm_mem_get_gpu_va_kernel(mem, gpu);

    status = uvm_push_begin(gpu->channel_manager,
                            UVM_CHANNEL_TYPE_GPU_INTERNAL,
                            &push,
                            "semaphore_reduction_inc test");
    TEST_CHECK_GOTO(status == NV_OK, done);

    for (i = 0; i < REDUCTIONS; i++) {
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, i + 1);
    }

    status = uvm_push_end_and_wait(&push);
    TEST_CHECK_GOTO(status == NV_OK, done);

    value = *host_ptr;
    if (value != REDUCTIONS) {
        UVM_TEST_PRINT("Value = %u instead of %u, GPU %s\n", value, REDUCTIONS, uvm_gpu_name(gpu));
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_mem_free(mem);

    return status;
}

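// Validate the semaphore release operation on a semaphore whose VA upper bit
// is set: release a known payload and read it back from the CPU.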
static NV_STATUS test_semaphore_release(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_mem_t *mem;
    NvU64 gpu_va;
    NvU32 value;
    NvU32 *host_ptr = NULL;
    NvU32 payload = 0xA5A55A5A;

    // Semaphore release needs 1 word (4 bytes).
    const size_t size = sizeof(NvU32);

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = test_semaphore_alloc_sem(gpu, size, &mem);
    TEST_CHECK_RET(status == NV_OK);

    // Initialize the payload.
    host_ptr = uvm_mem_get_cpu_addr_kernel(mem);
    TEST_CHECK_GOTO(host_ptr != NULL, done);
    *host_ptr = 0;

    gpu_va = uvm_mem_get_gpu_va_kernel(mem, gpu);

    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "semaphore_release test");
    TEST_CHECK_GOTO(status == NV_OK, done);

    gpu->parent->ce_hal->semaphore_release(&push, gpu_va, payload);

    status = uvm_push_end_and_wait(&push);
    TEST_CHECK_GOTO(status == NV_OK, done);

    value = *host_ptr;
    if (value != payload) {
        UVM_TEST_PRINT("Semaphore payload = %u instead of %u, GPU %s\n", value, payload, uvm_gpu_name(gpu));
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_mem_free(mem);

    return status;
}

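// Validate the semaphore timestamp operation on a semaphore whose VA upper bit
// is set: each iteration writes a timestamp and checks that it is non-zero and
// not smaller than the previous one.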
static NV_STATUS test_semaphore_timestamp(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_mem_t *mem;
    NvU64 gpu_va;
    NvU32 i;
    NvU64 *timestamp;
    NvU64 last_timestamp = 0;

    // 2 iterations:
    //   1: compare the retrieved timestamp with 0;
    //   2: compare the retrieved timestamp with the previous timestamp
    //      (obtained in 1).
    const NvU32 iterations = 2;

    // The semaphore is 4 words long (16 bytes).
    const size_t size = 16;

    // TODO: Bug 3839176: the test is waived on Confidential Computing because
    // it assumes that the GPU can access system memory without using
    // encryption.
    if (uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    status = test_semaphore_alloc_sem(gpu, size, &mem);
    TEST_CHECK_RET(status == NV_OK);

    timestamp = uvm_mem_get_cpu_addr_kernel(mem);
    TEST_CHECK_GOTO(timestamp != NULL, done);
    memset(timestamp, 0, size);

    // Shift the timestamp pointer to where the semaphore timestamp info is.
    timestamp += 1;

    gpu_va = uvm_mem_get_gpu_va_kernel(mem, gpu);

    for (i = 0; i < iterations; i++) {
        status = uvm_push_begin(gpu->channel_manager,
                                UVM_CHANNEL_TYPE_GPU_INTERNAL,
                                &push,
                                "semaphore_timestamp test, iter: %u",
                                i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        gpu->parent->ce_hal->semaphore_timestamp(&push, gpu_va);

        status = uvm_push_end_and_wait(&push);
        TEST_CHECK_GOTO(status == NV_OK, done);

        TEST_CHECK_GOTO(*timestamp != 0, done);
        TEST_CHECK_GOTO(*timestamp >= last_timestamp, done);
        last_timestamp = *timestamp;
    }

done:
    uvm_mem_free(mem);

    return status;
}

static bool mem_match(uvm_mem_t *mem1, uvm_mem_t *mem2, size_t size)
{
    void *mem1_addr;
    void *mem2_addr;

    UVM_ASSERT(uvm_mem_is_sysmem(mem1));
    UVM_ASSERT(uvm_mem_is_sysmem(mem2));
    UVM_ASSERT(mem1->size >= size);
    UVM_ASSERT(mem2->size >= size);

    mem1_addr = uvm_mem_get_cpu_addr_kernel(mem1);
    mem2_addr = uvm_mem_get_cpu_addr_kernel(mem2);

    return !memcmp(mem1_addr, mem2_addr, size);
}

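// Zero a vidmem allocation with a CE memset through its kernel virtual
// mapping.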
static NV_STATUS zero_vidmem(uvm_mem_t *mem)
{
    uvm_push_t push;
    uvm_gpu_address_t gpu_address;
    uvm_gpu_t *gpu = mem->backing_gpu;

    UVM_ASSERT(uvm_mem_is_vidmem(mem));

    TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));

    gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
    gpu->parent->ce_hal->memset_1(&push, gpu_address, 0, mem->size);

    TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));

    return NV_OK;
}

static void write_range_cpu(uvm_mem_t *mem, NvU64 base_val)
{
    NvU64 *mem_cpu_va;
    unsigned i;

    UVM_ASSERT(uvm_mem_is_sysmem(mem));
    UVM_ASSERT(IS_ALIGNED(mem->size, sizeof(*mem_cpu_va)));

    mem_cpu_va = (NvU64 *) uvm_mem_get_cpu_addr_kernel(mem);

    for (i = 0; i < (mem->size / sizeof(*mem_cpu_va)); i++)
        mem_cpu_va[i] = base_val++;
}

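// Allocate vidmem, map it in UVM's internal GPU address space, and zero it
// using the CE.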
static NV_STATUS alloc_vidmem_protected(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size)
{
    NV_STATUS status;

    UVM_ASSERT(mem);

    *mem = NULL;

    TEST_NV_CHECK_RET(uvm_mem_alloc_vidmem(size, gpu, mem));
    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);
    TEST_NV_CHECK_GOTO(zero_vidmem(*mem), err);

    return NV_OK;

err:
    uvm_mem_free(*mem);
    return status;
}

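// Allocate DMA-addressable sysmem, map it on both the CPU and the GPU, and
// zero it from the CPU.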
static NV_STATUS alloc_sysmem_unprotected(uvm_gpu_t *gpu, uvm_mem_t **mem, size_t size)
{
    NV_STATUS status;

    UVM_ASSERT(mem);

    *mem = NULL;

    TEST_NV_CHECK_RET(uvm_mem_alloc_sysmem_dma(size, gpu, NULL, mem));
    TEST_NV_CHECK_GOTO(uvm_mem_map_cpu_kernel(*mem), err);
    TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(*mem, gpu), err);

    memset(uvm_mem_get_cpu_addr_kernel(*mem), 0, (*mem)->size);

    return NV_OK;

err:
    uvm_mem_free(*mem);
    return status;
}

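// CPU-side encryption of src_mem into dst_mem in copy_size chunks, producing
// one authentication tag per chunk.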
static void cpu_encrypt(uvm_channel_t *channel,
                        uvm_mem_t *dst_mem,
                        uvm_mem_t *src_mem,
                        uvm_mem_t *auth_tag_mem,
                        size_t size,
                        NvU32 copy_size)
{
    size_t offset = 0;
    char *src_plain = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
    char *dst_cipher = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
    char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);

    while (offset < size) {
        uvm_conf_computing_cpu_encrypt(channel, dst_cipher, src_plain, NULL, copy_size, auth_tag_buffer);

        offset += copy_size;
        dst_cipher += copy_size;
        src_plain += copy_size;
        auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    }
}

static void cpu_acquire_encryption_ivs(uvm_channel_t *channel,
                                       size_t size,
                                       NvU32 copy_size,
                                       UvmCslIv *ivs)
{
    size_t offset = 0;
    int i = 0;

    for (; offset < size; offset += copy_size)
        uvm_conf_computing_acquire_encryption_iv(channel, &ivs[i++]);
}

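// CPU-side encryption using previously acquired IVs, processing the chunks in
// reverse order relative to how the GPU will decrypt them.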
static void cpu_encrypt_rev(uvm_channel_t *channel,
                            uvm_mem_t *dst_mem,
                            uvm_mem_t *src_mem,
                            uvm_mem_t *auth_tag_mem,
                            size_t size,
                            NvU32 copy_size,
                            UvmCslIv *encrypt_iv)
{
    char *src_plain = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
    char *dst_cipher = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
    char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);
    int i;

    // CPU encrypt order is the opposite of the GPU decrypt order
    for (i = (size / copy_size) - 1; i >= 0; i--) {
        uvm_conf_computing_cpu_encrypt(channel,
                                       dst_cipher + i * copy_size,
                                       src_plain + i * copy_size,
                                       encrypt_iv + i,
                                       copy_size,
                                       auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
    }
}

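// CPU-side decryption of the chunks in the same order the GPU encrypted them.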
static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
                                      uvm_mem_t *dst_mem,
                                      uvm_mem_t *src_mem,
                                      const UvmCslIv *decrypt_iv,
                                      uvm_mem_t *auth_tag_mem,
                                      size_t size,
                                      NvU32 copy_size)
{
    size_t i;
    char *dst_plain = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
    char *src_cipher = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
    char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);

    for (i = 0; i < size / copy_size; i++) {
        TEST_NV_CHECK_RET(uvm_conf_computing_cpu_decrypt(channel,
                                                         dst_plain + i * copy_size,
                                                         src_cipher + i * copy_size,
                                                         decrypt_iv + i,
                                                         copy_size,
                                                         auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
    }

    return NV_OK;
}

static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
                                          uvm_mem_t *dst_mem,
                                          uvm_mem_t *src_mem,
                                          const UvmCslIv *decrypt_iv,
                                          uvm_mem_t *auth_tag_mem,
                                          size_t size,
                                          NvU32 copy_size)
{
    int i;
    char *dst_plain = (char *) uvm_mem_get_cpu_addr_kernel(dst_mem);
    char *src_cipher = (char *) uvm_mem_get_cpu_addr_kernel(src_mem);
    char *auth_tag_buffer = (char *) uvm_mem_get_cpu_addr_kernel(auth_tag_mem);

    UVM_ASSERT((size / copy_size) <= INT_MAX);

    // CPU decrypt order is the opposite of the GPU encrypt order
    for (i = (size / copy_size) - 1; i >= 0; i--) {
        TEST_NV_CHECK_RET(uvm_conf_computing_cpu_decrypt(channel,
                                                         dst_plain + i * copy_size,
                                                         src_cipher + i * copy_size,
                                                         decrypt_iv + i,
                                                         copy_size,
                                                         auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
    }

    return NV_OK;
}

// GPU address to use as source or destination in CE decrypt/encrypt
// operations. If the uvm_mem backing storage is contiguous in the
// [offset, offset + size) interval, the physical address gets priority over
// the virtual counterpart.
static uvm_gpu_address_t gpu_address(uvm_mem_t *mem, uvm_gpu_t *gpu, NvU64 offset, NvU32 size)
{
    uvm_gpu_address_t gpu_virtual_address;

    if (uvm_mem_is_physically_contiguous(mem, offset, size))
        return uvm_mem_gpu_address_physical(mem, gpu, offset, size);

    gpu_virtual_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
    gpu_virtual_address.address += offset;

    return gpu_virtual_address;
}

// Automatically get the correct address for the authentication tag. The
// addressing mode of the tag should match that of the reference address
// (destination pointer for GPU encrypt, source pointer for GPU decrypt)
static uvm_gpu_address_t auth_tag_gpu_address(uvm_mem_t *auth_tag_mem,
                                              uvm_gpu_t *gpu,
                                              size_t offset,
                                              uvm_gpu_address_t reference)
{
    uvm_gpu_address_t auth_tag_gpu_address;

    if (!reference.is_virtual)
        return uvm_mem_gpu_address_physical(auth_tag_mem, gpu, offset, UVM_CONF_COMPUTING_AUTH_TAG_SIZE);

    auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);
    auth_tag_gpu_address.address += offset;

    return auth_tag_gpu_address;
}

// Note: no membar is issued in any of the GPU transfers (encryptions)
static void gpu_encrypt(uvm_push_t *push,
                        uvm_mem_t *dst_mem,
                        uvm_mem_t *src_mem,
                        uvm_mem_t *auth_tag_mem,
                        UvmCslIv *decrypt_iv,
                        size_t size,
                        NvU32 copy_size)
{
    size_t i;
    size_t num_iterations = size / copy_size;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    for (i = 0; i < num_iterations; i++) {
        uvm_gpu_address_t dst_cipher = gpu_address(dst_mem, gpu, i * copy_size, copy_size);
        uvm_gpu_address_t src_plain = gpu_address(src_mem, gpu, i * copy_size, copy_size);
        uvm_gpu_address_t auth_tag = auth_tag_gpu_address(auth_tag_mem,
                                                          gpu,
                                                          i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                          dst_cipher);

        uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);

        if (i > 0)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);

        uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);

        gpu->parent->ce_hal->encrypt(push, dst_cipher, src_plain, copy_size, auth_tag);
        decrypt_iv++;
    }
}

// Note: no membar is issued in any of the GPU transfers (decryptions)
static void gpu_decrypt(uvm_push_t *push,
                        uvm_mem_t *dst_mem,
                        uvm_mem_t *src_mem,
                        uvm_mem_t *auth_tag_mem,
                        size_t size,
                        NvU32 copy_size)
{
    size_t i;
    size_t num_iterations = size / copy_size;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    for (i = 0; i < num_iterations; i++) {
        uvm_gpu_address_t dst_plain = gpu_address(dst_mem, gpu, i * copy_size, copy_size);
        uvm_gpu_address_t src_cipher = gpu_address(src_mem, gpu, i * copy_size, copy_size);
        uvm_gpu_address_t auth_tag = auth_tag_gpu_address(auth_tag_mem,
                                                          gpu,
                                                          i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
                                                          src_cipher);

        if (i > 0)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);

        uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);

        gpu->parent->ce_hal->decrypt(push, dst_plain, src_cipher, copy_size, auth_tag);
    }
}

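// Round-trip a buffer through the GPU: encrypt on the CPU, decrypt on the GPU
// into protected vidmem, encrypt on the GPU back into unprotected sysmem, and
// decrypt on the CPU. The final plaintext must match the original, and the
// intermediate ciphertexts must differ from it.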
static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
                                           uvm_channel_type_t decrypt_channel_type,
                                           uvm_channel_type_t encrypt_channel_type,
                                           size_t size,
                                           NvU32 copy_size,
                                           bool decrypt_in_order,
                                           bool encrypt_in_order)
{
    uvm_push_t push;
    NvU64 init_value;
    NV_STATUS status = NV_OK;
    uvm_mem_t *src_plain = NULL;
    uvm_mem_t *src_cipher = NULL;
    uvm_mem_t *dst_cipher = NULL;
    uvm_mem_t *dst_plain_gpu = NULL;
    uvm_mem_t *dst_plain = NULL;
    uvm_mem_t *auth_tag_mem = NULL;
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    UvmCslIv *decrypt_iv = NULL;
    UvmCslIv *encrypt_iv = NULL;
    uvm_tracker_t tracker;
    size_t src_plain_size;

    TEST_CHECK_RET(copy_size <= size);
    TEST_CHECK_RET(IS_ALIGNED(size, copy_size));

    uvm_tracker_init(&tracker);

    decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!decrypt_iv) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    encrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!encrypt_iv) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &src_cipher, size), out);
    TEST_NV_CHECK_GOTO(alloc_vidmem_protected(gpu, &dst_plain_gpu, size), out);
    TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &dst_cipher, size), out);
    TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &dst_plain, size), out);
    TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &auth_tag_mem, auth_tag_buffer_size), out);

    // The plaintext CPU buffer size should fit the initialization value
    src_plain_size = UVM_ALIGN_UP(size, sizeof(init_value));
    TEST_NV_CHECK_GOTO(alloc_sysmem_unprotected(gpu, &src_plain, src_plain_size), out);

    // Initialize the plaintext CPU buffer using a value that uniquely
    // identifies the given inputs
    TEST_CHECK_GOTO((((NvU64) size) < (1ULL << 63)), out);
    init_value = ((NvU64) decrypt_in_order << 63) | ((NvU64) size) | ((NvU64) copy_size);
    write_range_cpu(src_plain, init_value);

    TEST_NV_CHECK_GOTO(uvm_push_begin(gpu->channel_manager,
                                      decrypt_channel_type,
                                      &push,
                                      "CPU > GPU decrypt"),
                       out);

    // CPU (decrypted) > CPU (encrypted), using the CPU, if encrypting in
    // order. Otherwise, only acquire the IVs now; the actual encryption
    // happens later, in reverse order.
    if (encrypt_in_order)
        cpu_encrypt(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size);
    else
        cpu_acquire_encryption_ivs(push.channel, size, copy_size, encrypt_iv);

    // CPU (encrypted) > GPU (decrypted), using GPU
    gpu_decrypt(&push, dst_plain_gpu, src_cipher, auth_tag_mem, size, copy_size);

    // Use the acquired IVs to encrypt in reverse order
    if (!encrypt_in_order)
        cpu_encrypt_rev(push.channel, src_cipher, src_plain, auth_tag_mem, size, copy_size, encrypt_iv);

    uvm_push_end(&push);
    TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), out);

    // GPU (decrypted) > CPU (encrypted), using GPU
    TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
                                              encrypt_channel_type,
                                              &tracker,
                                              &push,
                                              "GPU > CPU encrypt"),
                       out);

    gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);

    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

    TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);

    TEST_CHECK_GOTO(!mem_match(dst_cipher, src_plain, size), out);

    // CPU (encrypted) > CPU (decrypted), using CPU
    if (decrypt_in_order) {
        TEST_NV_CHECK_GOTO(cpu_decrypt_in_order(push.channel,
                                                dst_plain,
                                                dst_cipher,
                                                decrypt_iv,
                                                auth_tag_mem,
                                                size,
                                                copy_size),
                           out);
    }
    else {
        TEST_NV_CHECK_GOTO(cpu_decrypt_out_of_order(push.channel,
                                                    dst_plain,
                                                    dst_cipher,
                                                    decrypt_iv,
                                                    auth_tag_mem,
                                                    size,
                                                    copy_size),
                           out);
    }

    TEST_CHECK_GOTO(mem_match(src_plain, dst_plain, size), out);

out:
    uvm_mem_free(auth_tag_mem);
    uvm_mem_free(dst_plain);
    uvm_mem_free(dst_plain_gpu);
    uvm_mem_free(dst_cipher);
    uvm_mem_free(src_cipher);
    uvm_mem_free(src_plain);
    uvm_tracker_deinit(&tracker);
    uvm_kvfree(decrypt_iv);
    uvm_kvfree(encrypt_iv);

    return status;
}

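// Run the CPU<->GPU encryption round trip over a variety of sizes, copy
// sizes, and in-order/out-of-order CPU encryption and decryption
// combinations.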
static NV_STATUS test_encryption_decryption(uvm_gpu_t *gpu,
                                            uvm_channel_type_t decrypt_channel_type,
                                            uvm_channel_type_t encrypt_channel_type)
{
    bool cpu_decrypt_in_order = true;
    bool cpu_encrypt_in_order = true;
    size_t size[] = {UVM_PAGE_SIZE_4K, UVM_PAGE_SIZE_4K * 2, UVM_PAGE_SIZE_2M};
    size_t copy_size[] = {UVM_PAGE_SIZE_4K, UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_2M};
    unsigned i;

    struct {
        bool encrypt_in_order;
        bool decrypt_in_order;
    } orders[] = {{true, true}, {true, false}, {false, true}, {false, false}};

    struct {
        size_t size;
        NvU32 copy_size;
    } small_sizes[] = {{1, 1}, {3, 1}, {8, 1}, {2, 2}, {8, 4}, {UVM_PAGE_SIZE_4K - 8, 8}, {UVM_PAGE_SIZE_4K + 8, 8}};

    // Only Confidential Computing uses CE encryption/decryption
    if (!uvm_conf_computing_mode_enabled(gpu))
        return NV_OK;

    // Use sizes and copy sizes that are not multiples of common page sizes.
    for (i = 0; i < ARRAY_SIZE(small_sizes); ++i) {
        // Skip tests that need a large pushbuffer on WLC. Secure work launch
        // needs to do at least one decrypt operation, so tests that only need
        // one operation work fine. Tests using more operations might overflow
        // UVM_MAX_WLC_PUSH_SIZE.
        if (encrypt_channel_type == UVM_CHANNEL_TYPE_WLC && (small_sizes[i].size / small_sizes[i].copy_size > 1))
            continue;

        TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu,
                                                    decrypt_channel_type,
                                                    encrypt_channel_type,
                                                    small_sizes[i].size,
                                                    small_sizes[i].copy_size,
                                                    cpu_decrypt_in_order,
                                                    cpu_encrypt_in_order));
    }

    // Use sizes and copy sizes that are multiples of common page sizes.
    // This is the most typical usage of encrypt/decrypt in the UVM driver.
    for (i = 0; i < ARRAY_SIZE(orders); ++i) {
        unsigned j;

        cpu_encrypt_in_order = orders[i].encrypt_in_order;
        cpu_decrypt_in_order = orders[i].decrypt_in_order;

        for (j = 0; j < ARRAY_SIZE(size); ++j) {
            unsigned k;

            for (k = 0; k < ARRAY_SIZE(copy_size); ++k) {
                if (copy_size[k] > size[j])
                    continue;

                // Skip tests that need a large pushbuffer on WLC. Secure work
                // launch needs to do at least one decrypt operation, so tests
                // that only need one operation work fine. Tests using more
                // operations might overflow UVM_MAX_WLC_PUSH_SIZE.
                if (encrypt_channel_type == UVM_CHANNEL_TYPE_WLC && (size[j] / copy_size[k] > 1))
                    continue;

                // There is no difference between in-order and out-of-order
                // decryption when there is a single encrypted chunk.
                if ((copy_size[k] == size[j]) && !cpu_decrypt_in_order)
                    continue;

                TEST_NV_CHECK_RET(test_cpu_to_gpu_roundtrip(gpu,
                                                            decrypt_channel_type,
                                                            encrypt_channel_type,
                                                            size[j],
                                                            copy_size[k],
                                                            cpu_decrypt_in_order,
                                                            cpu_encrypt_in_order));
            }
        }
    }

    return NV_OK;
}

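// Run all CE sanity subtests on every GPU registered in the VA space.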
static NV_STATUS test_ce(uvm_va_space_t *va_space, bool skipTimestampTest)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_NV_CHECK_RET(test_non_pipelined(gpu));
        TEST_NV_CHECK_RET(test_membar(gpu));
        TEST_NV_CHECK_RET(test_memcpy_and_memset(gpu));
        TEST_NV_CHECK_RET(test_semaphore_reduction_inc(gpu));
        TEST_NV_CHECK_RET(test_semaphore_release(gpu));

        if (!skipTimestampTest)
            TEST_NV_CHECK_RET(test_semaphore_timestamp(gpu));

        TEST_NV_CHECK_RET(test_encryption_decryption(gpu, UVM_CHANNEL_TYPE_CPU_TO_GPU, UVM_CHANNEL_TYPE_GPU_TO_CPU));
        TEST_NV_CHECK_RET(test_encryption_decryption(gpu, UVM_CHANNEL_TYPE_WLC, UVM_CHANNEL_TYPE_WLC));
    }

    return NV_OK;
}

NV_STATUS uvm_test_ce_sanity(UVM_TEST_CE_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    uvm_va_space_down_read_rm(va_space);

    status = test_ce(va_space, params->skipTimestampTest);

    uvm_va_space_up_read_rm(va_space);

    return status;
}