/*******************************************************************************
    Copyright (c) 2020-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_hal.h"
#include "uvm_global.h"
#include "uvm_push.h"
#include "uvm_mem.h"
#include "uvm_conf_computing.h"
#include "clc8b5.h"

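// Map a UVM aperture to the CE SET_*_PHYS_MODE TARGET field. The BUILD_BUG_ONs
// below guarantee that the SET_SRC_PHYS_MODE and SET_DST_PHYS_MODE encodings
// match, so the returned value can be used for either the source or the
// destination. Peer apertures additionally encode the peer ID, with the FLA
// field cleared.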
static NvU32 ce_aperture(uvm_aperture_t aperture)
{
    BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB) !=
                 HWCONST(C8B5, SET_DST_PHYS_MODE, TARGET, LOCAL_FB));
    BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM) !=
                 HWCONST(C8B5, SET_DST_PHYS_MODE, TARGET, COHERENT_SYSMEM));
    BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, PEERMEM) !=
                 HWCONST(C8B5, SET_DST_PHYS_MODE, TARGET, PEERMEM));

    if (aperture == UVM_APERTURE_SYS) {
        return HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM);
    }
    else if (aperture == UVM_APERTURE_VID) {
        return HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB);
    }
    else {
        return HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, PEERMEM) |
               HWVALUE(C8B5, SET_SRC_PHYS_MODE, FLA, 0) |
               HWVALUE(C8B5, SET_SRC_PHYS_MODE, PEER_ID, UVM_APERTURE_PEER_ID(aperture));
    }
}

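// Set the destination offset (OFFSET_OUT) consumed by the next LAUNCH_DMA;
// uvm_hal_hopper_ce_offset_in_out() additionally sets the source offset.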
void uvm_hal_hopper_ce_offset_out(uvm_push_t *push, NvU64 offset_out)
{
    NV_PUSH_2U(C8B5, OFFSET_OUT_UPPER, HWVALUE(C8B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(C8B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

void uvm_hal_hopper_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 offset_out)
{
    NV_PUSH_4U(C8B5, OFFSET_IN_UPPER,  HWVALUE(C8B5, OFFSET_IN_UPPER,  UPPER, NvOffset_HI32(offset_in)),
                     OFFSET_IN_LOWER,  HWVALUE(C8B5, OFFSET_IN_LOWER,  VALUE, NvOffset_LO32(offset_in)),
                     OFFSET_OUT_UPPER, HWVALUE(C8B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(C8B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

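// Callers request a membar on the next CE operation by setting a push flag
// before emitting the method. An illustrative sketch (assuming the
// uvm_push_set_flag() helper and UVM_PUSH_FLAG_NEXT_MEMBAR_* flags declared in
// uvm_push.h):
//
//     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
//     gpu->parent->ce_hal->semaphore_release(push, gpu_va, payload);
//
// hopper_get_flush_value() consumes that flag and translates it into the
// LAUNCH_DMA flush fields.
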
// Return the LAUNCH_DMA flush enablement and, if a membar was requested on the
// push, the flush type: GL for a GPU-scope membar, SYS for a system-scope one.
static NvU32 hopper_get_flush_value(uvm_push_t *push)
{
    NvU32 flush_value;
    uvm_membar_t membar = uvm_push_get_and_reset_membar_flag(push);

    if (membar == UVM_MEMBAR_NONE) {
        // No MEMBAR requested, don't use a flush.
        flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
    }
    else {
        flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);

        if (membar == UVM_MEMBAR_GPU)
            flush_value |= HWCONST(C8B5, LAUNCH_DMA, FLUSH_TYPE, GL);
        else
            flush_value |= HWCONST(C8B5, LAUNCH_DMA, FLUSH_TYPE, SYS);
    }

    return flush_value;
}

void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 launch_dma_plc_mode;

    NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
                                 HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
                                 launch_dma_plc_mode);
}

void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 launch_dma_plc_mode;

    NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
                                 HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE) |
                                 launch_dma_plc_mode);
}

void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
{
    uvm_gpu_t *gpu;
    NvU32 launch_dma_plc_mode;

    NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, 0xdeadbeef);

    gpu = uvm_push_get_gpu(push);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
                                 HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_WITH_TIMESTAMP) |
                                 launch_dma_plc_mode);
}

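// If the memset destination is physical, push SET_DST_PHYS_MODE for its
// aperture. Return the LAUNCH_DMA DST_TYPE field (VIRTUAL or PHYSICAL) to be
// OR'd into the launch value.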
static NvU32 hopper_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst)
{
    if (dst.is_virtual)
        return HWCONST(C8B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);

    NV_PUSH_1U(C8B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    return HWCONST(C8B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);
}

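// Return whether the virtual address lies within this GPU's flat mapping of
// vidmem, i.e. within [flat_vidmem_va_base, flat_vidmem_va_base +
// UVM_GPU_MAX_PHYS_MEM). Always false on GPUs that use neither a static nor a
// dynamic flat vidmem mapping.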
static bool va_is_flat_vidmem(uvm_gpu_t *gpu, NvU64 va)
{
    return (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
            uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent)) &&
           va >= gpu->parent->flat_vidmem_va_base &&
           va < gpu->parent->flat_vidmem_va_base + UVM_GPU_MAX_PHYS_MEM;
}

// Return whether a memset should use the fast scrubber. If so, convert dst to
// the address needed by the fast scrubber.
static bool hopper_scrub_enable(uvm_gpu_t *gpu, uvm_gpu_address_t *dst, size_t size)
{
    if (!IS_ALIGNED(dst->address, UVM_PAGE_SIZE_4K) || !IS_ALIGNED(size, UVM_PAGE_SIZE_4K))
        return false;

    // When CE physical writes are disallowed, higher layers will convert
    // physical memsets to virtual using the flat mapping. Those layers are
    // unaware of the fast scrubber, which is safe to use specifically when CE
    // physical access is disallowed. Detect such memsets within the flat vidmem
    // region and convert them back to physical, since the fast scrubber only
    // works with physical addressing.
    if (dst->is_virtual && !gpu->parent->ce_phys_vidmem_write_supported && va_is_flat_vidmem(gpu, dst->address)) {
        *dst = uvm_gpu_address_physical(UVM_APERTURE_VID, dst->address - gpu->parent->flat_vidmem_va_base);
        return true;
    }

    return !dst->is_virtual && dst->aperture == UVM_APERTURE_VID;
}

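// Return the LAUNCH_DMA COPY_TYPE for a memset: NONPROT2NONPROT when
// Confidential Computing is enabled and the destination is unprotected,
// DEFAULT otherwise.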
static NvU32 hopper_memset_copy_type(uvm_gpu_address_t dst)
{
    if (g_uvm_global.conf_computing_enabled && dst.is_unprotected)
        return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);

    return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
}

NvU32 uvm_hal_hopper_ce_memcopy_copy_type(uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    if (g_uvm_global.conf_computing_enabled && dst.is_unprotected && src.is_unprotected)
        return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);

    return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
}

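// Emit one or more LAUNCH_DMA operations covering num_elements elements of
// memset_element_size bytes each, starting at dst. Illustrative sizing example
// derived from the loop below: a 6GiB fast-scrubber memset is issued as 1-byte
// elements, so the loop emits one 0xFFFFFFFF-byte LAUNCH_DMA followed by a
// second one for the remainder; only the final operation carries the requested
// flush/membar, and every chunk after the first is pipelined.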
static void hopper_memset_common(uvm_push_t *push,
                                 uvm_gpu_address_t dst,
                                 size_t num_elements,
                                 size_t memset_element_size)
{
    // If >4GB memsets ever become an important use case, this function should
    // use multi-line transfers so we don't have to iterate (bug 1766588).
    static const size_t max_single_memset = 0xFFFFFFFF;

    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 pipelined_value;
    NvU32 launch_dma_dst_type;
    NvU32 launch_dma_plc_mode;
    NvU32 launch_dma_remap_enable;
    NvU32 launch_dma_scrub_enable;
    NvU32 flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
    NvU32 copy_type_value = hopper_memset_copy_type(dst);
    bool is_scrub = hopper_scrub_enable(gpu, &dst, num_elements * memset_element_size);

    UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, num_elements, memset_element_size),
                   "Memset validation failed in channel %s, GPU %s",
                   push->channel->name,
                   uvm_gpu_name(gpu));

    launch_dma_dst_type = hopper_memset_push_phys_mode(push, dst);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    if (memset_element_size == 8 && is_scrub) {
        launch_dma_remap_enable = HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE);
        launch_dma_scrub_enable = HWCONST(C8B5, LAUNCH_DMA, MEMORY_SCRUB_ENABLE, TRUE);

        NV_PUSH_1U(C8B5, SET_MEMORY_SCRUB_PARAMETERS,
                   HWCONST(C8B5, SET_MEMORY_SCRUB_PARAMETERS, DISCARDABLE, FALSE));

        // Scrub requires disabling remap, and with remap disabled the element
        // size is 1.
        num_elements *= memset_element_size;
        memset_element_size = 1;
    }
    else {
        launch_dma_remap_enable = HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, TRUE);
        launch_dma_scrub_enable = HWCONST(C8B5, LAUNCH_DMA, MEMORY_SCRUB_ENABLE, FALSE);
    }

    do {
        NvU32 memset_this_time = (NvU32)min(num_elements, max_single_memset);

        // In the last operation, a flush/membar may be issued after the memset.
        if (num_elements == memset_this_time)
            flush_value = hopper_get_flush_value(push);

        gpu->parent->ce_hal->offset_out(push, dst.address);

        NV_PUSH_1U(C8B5, LINE_LENGTH_IN, memset_this_time);

        NV_PUSH_1U(C8B5, LAUNCH_DMA,
                   HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
                   HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
                   HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
                   flush_value |
                   launch_dma_remap_enable |
                   launch_dma_scrub_enable |
                   launch_dma_dst_type |
                   launch_dma_plc_mode |
                   copy_type_value |
                   pipelined_value);

        dst.address += memset_this_time * memset_element_size;
        num_elements -= memset_this_time;
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    } while (num_elements > 0);
}

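// Worked example of the remap programming below: for value 0x1122334455667788,
// CONST_A is set to 0x55667788 (the low word) and CONST_B to 0x11223344 (the
// high word). With COMPONENT_SIZE FOUR and NUM_DST_COMPONENTS TWO, every
// 8-byte element written consists of CONST_A followed by CONST_B, i.e. the
// full 64-bit value, low word first.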
void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size)
{
    UVM_ASSERT_MSG(size % 8 == 0, "size: %zd\n", size);

    size /= 8;

    NV_PUSH_3U(C8B5, SET_REMAP_CONST_A, (NvU32)value,
                     SET_REMAP_CONST_B, (NvU32)(value >> 32),
                     SET_REMAP_COMPONENTS,
       HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_X,               CONST_A) |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_Y,               CONST_B) |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE,      FOUR)    |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS,  TWO));

    hopper_memset_common(push, dst, size, 8);
}

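// For scrub-eligible destinations, a 1-byte memset is widened to the 8-byte
// path by splatting the byte (e.g. value 0xAB becomes 0xABABABABABABABAB);
// otherwise the remap unit replicates the single byte directly.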
void uvm_hal_hopper_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size)
{
    if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
        NvU64 value64 = value;

        value64 |= value64 << 8;
        value64 |= value64 << 16;
        value64 |= value64 << 32;

        uvm_hal_hopper_ce_memset_8(push, dst, value64, size);
        return;
    }

    NV_PUSH_2U(C8B5, SET_REMAP_CONST_B,    (NvU32)value,
                     SET_REMAP_COMPONENTS,
       HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_X,               CONST_B) |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE,      ONE)     |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS,  ONE));

    hopper_memset_common(push, dst, size, 1);
}

void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size)
{
    UVM_ASSERT_MSG(size % 4 == 0, "size: %zd\n", size);

    if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
        NvU64 value64 = value;

        value64 |= value64 << 32;

        uvm_hal_hopper_ce_memset_8(push, dst, value64, size);
        return;
    }

    size /= 4;

    NV_PUSH_2U(C8B5, SET_REMAP_CONST_B,    value,
                     SET_REMAP_COMPONENTS,
       HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_X,               CONST_B) |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE,      FOUR)    |
       HWCONST(C8B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS,  ONE));

    hopper_memset_common(push, dst, size, 4);
}

bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push,
                                       uvm_gpu_address_t dst,
                                       size_t num_elements,
                                       size_t element_size)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // In HCC, a memset that uses physical addressing for the destination must
    // write to (protected) vidmem. If the memset uses virtual addressing and
    // the backing storage is not vidmem, the access is only legal when the
    // copy type is NONPROT2NONPROT and the destination is unprotected sysmem,
    // but this validation does not detect that case.
    if (uvm_conf_computing_mode_is_hcc(gpu) && !dst.is_virtual && dst.aperture != UVM_APERTURE_VID)
        return false;

    if (!gpu->parent->ce_phys_vidmem_write_supported) {
        size_t size = num_elements * element_size;
        uvm_gpu_address_t temp = dst;

        // Physical vidmem writes are disallowed, unless using the fast
        // scrubber.
        if (!dst.is_virtual && dst.aperture == UVM_APERTURE_VID && !hopper_scrub_enable(gpu, &temp, size)) {
            UVM_ERR_PRINT("Destination address of vidmem memset must be virtual, not physical: {%s, 0x%llx} size %zu\n",
                          uvm_gpu_address_aperture_string(dst),
                          dst.address,
                          size);
            return false;
        }
    }

    return true;
}

bool uvm_hal_hopper_ce_memcopy_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    if (uvm_conf_computing_mode_is_hcc(gpu)) {
        // In HCC, if a memcopy uses physical addressing for either the
        // destination or the source, the corresponding aperture must be
        // vidmem. If virtual addressing is used and the backing storage is
        // sysmem, the access is only legal when the copy type is
        // NONPROT2NONPROT, i.e., both the source and the destination are
        // unprotected sysmem, but this validation does not detect that case.
        if (!src.is_virtual && (src.aperture != UVM_APERTURE_VID))
            return false;

        if (!dst.is_virtual && (dst.aperture != UVM_APERTURE_VID))
            return false;

        if (dst.is_unprotected != src.is_unprotected)
            return false;
    }

    if (!gpu->parent->ce_phys_vidmem_write_supported && !dst.is_virtual && dst.aperture == UVM_APERTURE_VID) {
        UVM_ERR_PRINT("Destination address of vidmem memcopy must be virtual, not physical: {%s, 0x%llx}\n",
                      uvm_gpu_address_aperture_string(dst),
                      dst.address);
        return false;
    }

    return true;
}

// Specialized version of uvm_hal_volta_ce_memcopy used for encryption and
// decryption. Pre-Hopper functionality, such as validation or address patching,
// has been removed.
static void encrypt_or_decrypt(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, NvU32 size)
{
    NvU32 pipelined_value;
    NvU32 launch_dma_src_dst_type;
    NvU32 launch_dma_plc_mode;
    NvU32 flush_value;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // HW allows unaligned operations only if the entire buffer is in one 32B
    // sector. Operations on buffers larger than 32B have to be aligned.
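    // For example, a 16-byte operation at offset 0x1010 stays within the 32B
    // sector starting at 0x1000 and is allowed, while the same 16 bytes at
    // offset 0x1018 would straddle the sectors at 0x1000 and 0x1020 and is
    // rejected by the assertions below.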
    if (size > UVM_CONF_COMPUTING_BUF_ALIGNMENT) {
        UVM_ASSERT(IS_ALIGNED(src.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
        UVM_ASSERT(IS_ALIGNED(dst.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
    }
    else {
        // The first and last byte of each buffer must land in the same aligned
        // sector, so compare their sector indices.
        UVM_ASSERT((dst.address / UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
                   ((dst.address + size - 1) / UVM_CONF_COMPUTING_BUF_ALIGNMENT));
        UVM_ASSERT((src.address / UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
                   ((src.address + size - 1) / UVM_CONF_COMPUTING_BUF_ALIGNMENT));
    }

    launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    flush_value = hopper_get_flush_value(push);

    gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);

    NV_PUSH_1U(C8B5, LINE_LENGTH_IN, size);

    NV_PUSH_1U(C8B5, LAUNCH_DMA, HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
                                 HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
                                 HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
                                 HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) |
                                 HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, SECURE) |
                                 flush_value |
                                 launch_dma_src_dst_type |
                                 launch_dma_plc_mode |
                                 pipelined_value);
}

// The GPU CE encrypt operation requires clients to pass a valid
// address where the used IV will be written. But this requirement is
// unnecessary, because UVM should instead rely on the CSL
// nvUvmInterfaceCslLogDeviceEncryption API to independently track
// the expected IV.
//
// To satisfy the HW requirement, the same unprotected sysmem address is
// passed to all GPU-side encryptions. This dummy buffer is allocated at
// GPU initialization time.
static NvU64 encrypt_iv_address(uvm_push_t *push, uvm_gpu_address_t dst)
{
    NvU64 iv_address;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // Match the addressing mode of the destination and the IV.
    if (dst.is_virtual) {
        iv_address = uvm_rm_mem_get_gpu_va(gpu->conf_computing.iv_rm_mem, gpu, false).address;
    }
    else {
        iv_address = uvm_mem_gpu_physical(gpu->conf_computing.iv_mem,
                                          gpu,
                                          0,
                                          gpu->conf_computing.iv_mem->size).address;
    }

    UVM_ASSERT(IS_ALIGNED(iv_address, UVM_CONF_COMPUTING_IV_ALIGNMENT));

    return iv_address;
}

// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
void uvm_hal_hopper_ce_encrypt(uvm_push_t *push,
                               uvm_gpu_address_t dst,
                               uvm_gpu_address_t src,
                               NvU32 size,
                               uvm_gpu_address_t auth_tag)
{
    NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
    NvU64 iv_address;
    NvU32 iv_address_hi32, iv_address_lo32;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
    UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));

    if (!src.is_virtual)
        UVM_ASSERT(src.aperture == UVM_APERTURE_VID);

    // The addressing mode (and aperture, if applicable) of the destination
    // pointer determines the addressing mode and aperture used by the
    // encryption to reference the other two addresses it writes: the
    // authentication tag and the IV. If the client passes a sysmem physical
    // address as the destination, the authentication tag must also be a
    // sysmem physical address.
    UVM_ASSERT(dst.is_virtual == auth_tag.is_virtual);

    if (!dst.is_virtual) {
        UVM_ASSERT(dst.aperture == UVM_APERTURE_SYS);
        UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
    }

    NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, ENCRYPT));

    auth_tag_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
    auth_tag_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));

    iv_address = encrypt_iv_address(push, dst);

    iv_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_UPPER, UPPER, NvU64_HI32(iv_address));
    iv_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_LOWER, LOWER, NvU64_LO32(iv_address));

    NV_PUSH_4U(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, auth_tag_address_hi32,
                     SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, auth_tag_address_lo32,
                     SET_ENCRYPT_IV_ADDR_UPPER, iv_address_hi32,
                     SET_ENCRYPT_IV_ADDR_LOWER, iv_address_lo32);

    encrypt_or_decrypt(push, dst, src, size);
}

532 
533 // TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
534 void uvm_hal_hopper_ce_decrypt(uvm_push_t *push,
535                                uvm_gpu_address_t dst,
536                                uvm_gpu_address_t src,
537                                NvU32 size,
538                                uvm_gpu_address_t auth_tag)
539 {
540 
541     NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
542     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
543 
544     UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
545     UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
546 
547     // The addressing mode (and aperture, if applicable) of the source and
548     // authentication pointers should match. But unlike in the encryption case,
549     // clients are not forced to pass a valid IV address.
550     UVM_ASSERT(src.is_virtual == auth_tag.is_virtual);
551 
552     if (!src.is_virtual) {
553         UVM_ASSERT(src.aperture == UVM_APERTURE_SYS);
554         UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
555     }
556 
557     if (!dst.is_virtual)
558         UVM_ASSERT(dst.aperture == UVM_APERTURE_VID);
559 
560     NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, DECRYPT));
561 
562     auth_tag_address_hi32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
563     auth_tag_address_lo32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));
564 
565     NV_PUSH_2U(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, auth_tag_address_hi32,
566                      SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, auth_tag_address_lo32);
567 
568     encrypt_or_decrypt(push, dst, src, size);
569 }