/*******************************************************************************
    Copyright (c) 2020-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_hal.h"
#include "uvm_global.h"
#include "uvm_push.h"
#include "uvm_mem.h"
#include "uvm_conf_computing.h"
#include "clc8b5.h"

static NvU32 ce_aperture(uvm_aperture_t aperture)
{
    BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB) !=
                 HWCONST(C8B5, SET_DST_PHYS_MODE, TARGET, LOCAL_FB));
    BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM) !=
                 HWCONST(C8B5, SET_DST_PHYS_MODE, TARGET, COHERENT_SYSMEM));
    BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, PEERMEM) !=
                 HWCONST(C8B5, SET_DST_PHYS_MODE, TARGET, PEERMEM));

    if (aperture == UVM_APERTURE_SYS) {
        return HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM);
    }
    else if (aperture == UVM_APERTURE_VID) {
        return HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB);
    }
    else {
        return HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, PEERMEM) |
               HWVALUE(C8B5, SET_SRC_PHYS_MODE, FLA, 0) |
               HWVALUE(C8B5, SET_SRC_PHYS_MODE, PEER_ID, UVM_APERTURE_PEER_ID(aperture));
    }
}

void uvm_hal_hopper_ce_offset_out(uvm_push_t *push, NvU64 offset_out)
{
    NV_PUSH_2U(C8B5, OFFSET_OUT_UPPER, HWVALUE(C8B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(C8B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

void uvm_hal_hopper_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 offset_out)
{
    NV_PUSH_4U(C8B5, OFFSET_IN_UPPER, HWVALUE(C8B5, OFFSET_IN_UPPER, UPPER, NvOffset_HI32(offset_in)),
                     OFFSET_IN_LOWER, HWVALUE(C8B5, OFFSET_IN_LOWER, VALUE, NvOffset_LO32(offset_in)),
                     OFFSET_OUT_UPPER, HWVALUE(C8B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(C8B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

// Return the flush type and the flush enablement.
static NvU32 hopper_get_flush_value(uvm_push_t *push)
{
    NvU32 flush_value;
    uvm_membar_t membar = uvm_push_get_and_reset_membar_flag(push);

    if (membar == UVM_MEMBAR_NONE) {
        // No MEMBAR requested, don't use a flush.
        flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
    }
    else {
        flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);

        if (membar == UVM_MEMBAR_GPU)
            flush_value |= HWCONST(C8B5, LAUNCH_DMA, FLUSH_TYPE, GL);
        else
            flush_value |= HWCONST(C8B5, LAUNCH_DMA, FLUSH_TYPE, SYS);
    }

    return flush_value;
}
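
// Illustrative note (the push APIs referenced here are defined outside this
// file): a caller that only needs GPU-scope ordering can set the membar flag
// on the push before emitting the operation, e.g.
//
//     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
//     gpu->parent->ce_hal->memset_4(push, dst, 0, size);
//
// in which case hopper_get_flush_value() selects FLUSH_ENABLE_TRUE with
// FLUSH_TYPE_GL for the LAUNCH_DMA that completes the operation.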

void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 launch_dma_plc_mode;

    NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
                                 HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
                                 launch_dma_plc_mode);
}

void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 launch_dma_plc_mode;

    NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
                                 HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_NO_TIMESTAMP) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE) |
                                 launch_dma_plc_mode);
}

void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
{
    uvm_gpu_t *gpu;
    NvU32 launch_dma_plc_mode;

    NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, 0xdeadbeef);

    gpu = uvm_push_get_gpu(push);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
                                 HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_PAYLOAD_SIZE, ONE_WORD) |
                                 HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_SEMAPHORE_WITH_TIMESTAMP) |
                                 launch_dma_plc_mode);
}

static NvU32 hopper_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst)
{
    if (dst.is_virtual)
        return HWCONST(C8B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);

    NV_PUSH_1U(C8B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    return HWCONST(C8B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);
}

static bool va_is_flat_vidmem(uvm_gpu_t *gpu, NvU64 va)
{
    return (uvm_mmu_parent_gpu_needs_static_vidmem_mapping(gpu->parent) ||
            uvm_mmu_parent_gpu_needs_dynamic_vidmem_mapping(gpu->parent)) &&
           va >= gpu->parent->flat_vidmem_va_base &&
           va < gpu->parent->flat_vidmem_va_base + UVM_GPU_MAX_PHYS_MEM;
}

// Return whether a memset should use the fast scrubber. If so, convert dst to
// the address needed by the fast scrubber.
static bool hopper_scrub_enable(uvm_gpu_t *gpu, uvm_gpu_address_t *dst, size_t size)
{
    if (!IS_ALIGNED(dst->address, UVM_PAGE_SIZE_4K) || !IS_ALIGNED(size, UVM_PAGE_SIZE_4K))
        return false;

    // When CE physical writes are disallowed, higher layers will convert
    // physical memsets to virtual using the flat mapping. Those layers are
    // unaware of the fast scrubber, which is safe to use specifically when CE
    // physical access is disallowed. Detect such memsets within the flat
    // vidmem region and convert them back to physical, since the fast
    // scrubber only works with physical addressing.
    if (dst->is_virtual && !gpu->parent->ce_phys_vidmem_write_supported && va_is_flat_vidmem(gpu, dst->address)) {
        *dst = uvm_gpu_address_physical(UVM_APERTURE_VID, dst->address - gpu->parent->flat_vidmem_va_base);
        return true;
    }

    return !dst->is_virtual && dst->aperture == UVM_APERTURE_VID;
}
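
// Illustrative example of the conversion above, assuming a hypothetical
// flat_vidmem_va_base of 0x1000000000 on a GPU that uses a flat vidmem
// mapping and disallows CE physical writes: a virtual 8K memset of address
// 0x1000002000 is redirected by hopper_scrub_enable() to the physical address
// {vid, 0x2000}, which the fast scrubber can consume.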

static NvU32 hopper_memset_copy_type(uvm_gpu_address_t dst)
{
    if (g_uvm_global.conf_computing_enabled && dst.is_unprotected)
        return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);
    return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
}

NvU32 uvm_hal_hopper_ce_memcopy_copy_type(uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    if (g_uvm_global.conf_computing_enabled && dst.is_unprotected && src.is_unprotected)
        return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, NONPROT2NONPROT);

    return HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, DEFAULT);
}

static void hopper_memset_common(uvm_push_t *push,
                                 uvm_gpu_address_t dst,
                                 size_t num_elements,
                                 size_t memset_element_size)
{
    // If >4GB memsets ever become an important use case, this function should
    // use multi-line transfers so we don't have to iterate (bug 1766588).
    static const size_t max_single_memset = 0xFFFFFFFF;

    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 pipelined_value;
    NvU32 launch_dma_dst_type;
    NvU32 launch_dma_plc_mode;
    NvU32 launch_dma_remap_enable;
    NvU32 launch_dma_scrub_enable;
    NvU32 flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
    NvU32 copy_type_value = hopper_memset_copy_type(dst);
    bool is_scrub = hopper_scrub_enable(gpu, &dst, num_elements * memset_element_size);

    UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, num_elements, memset_element_size),
                   "Memset validation failed in channel %s, GPU %s",
                   push->channel->name,
                   uvm_gpu_name(gpu));

    launch_dma_dst_type = hopper_memset_push_phys_mode(push, dst);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    if (memset_element_size == 8 && is_scrub) {
        launch_dma_remap_enable = HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE);
        launch_dma_scrub_enable = HWCONST(C8B5, LAUNCH_DMA, MEMORY_SCRUB_ENABLE, TRUE);

        NV_PUSH_1U(C8B5, SET_MEMORY_SCRUB_PARAMETERS,
                   HWCONST(C8B5, SET_MEMORY_SCRUB_PARAMETERS, DISCARDABLE, FALSE));

        // Scrub requires disabling remap, and with remap disabled the element
        // size is 1.
        num_elements *= memset_element_size;
        memset_element_size = 1;
    }
    else {
        launch_dma_remap_enable = HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, TRUE);
        launch_dma_scrub_enable = HWCONST(C8B5, LAUNCH_DMA, MEMORY_SCRUB_ENABLE, FALSE);
    }

    do {
        NvU32 memset_this_time = (NvU32)min(num_elements, max_single_memset);

        // In the last operation, a flush/membar may be issued after the memset.
        if (num_elements == memset_this_time)
            flush_value = hopper_get_flush_value(push);

        gpu->parent->ce_hal->offset_out(push, dst.address);

        NV_PUSH_1U(C8B5, LINE_LENGTH_IN, memset_this_time);

        NV_PUSH_1U(C8B5, LAUNCH_DMA,
                   HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
                   HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
                   HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
                   flush_value |
                   launch_dma_remap_enable |
                   launch_dma_scrub_enable |
                   launch_dma_dst_type |
                   launch_dma_plc_mode |
                   copy_type_value |
                   pipelined_value);

        dst.address += memset_this_time * memset_element_size;
        num_elements -= memset_this_time;
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    } while (num_elements > 0);
}
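
// Descriptive note on the entry points below: they rely on the CE remap
// settings to expand the memset value. memset_8 splits the 64-bit pattern
// into two 4-byte components sourced from SET_REMAP_CONST_A (low word) and
// SET_REMAP_CONST_B (high word), while memset_1 and memset_4 write a single
// 1- or 4-byte component sourced from SET_REMAP_CONST_B.
// hopper_memset_common() then treats LINE_LENGTH_IN as a count of such
// elements, except on the scrub path, where remap is disabled and the length
// is expressed in bytes.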

void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size)
{
    UVM_ASSERT_MSG(size % 8 == 0, "size: %zd\n", size);

    size /= 8;

    NV_PUSH_3U(C8B5, SET_REMAP_CONST_A, (NvU32)value,
                     SET_REMAP_CONST_B, (NvU32)(value >> 32),
                     SET_REMAP_COMPONENTS,
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_X, CONST_A) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_Y, CONST_B) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, FOUR) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, TWO));

    hopper_memset_common(push, dst, size, 8);
}

void uvm_hal_hopper_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size)
{
    if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
        NvU64 value64 = value;

        value64 |= value64 << 8;
        value64 |= value64 << 16;
        value64 |= value64 << 32;

        uvm_hal_hopper_ce_memset_8(push, dst, value64, size);
        return;
    }

    NV_PUSH_2U(C8B5, SET_REMAP_CONST_B, (NvU32)value,
                     SET_REMAP_COMPONENTS,
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_X, CONST_B) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, ONE) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, ONE));

    hopper_memset_common(push, dst, size, 1);
}

void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size)
{
    UVM_ASSERT_MSG(size % 4 == 0, "size: %zd\n", size);

    if (hopper_scrub_enable(uvm_push_get_gpu(push), &dst, size)) {
        NvU64 value64 = value;

        value64 |= value64 << 32;

        uvm_hal_hopper_ce_memset_8(push, dst, value64, size);
        return;
    }

    size /= 4;

    NV_PUSH_2U(C8B5, SET_REMAP_CONST_B, value,
                     SET_REMAP_COMPONENTS,
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, DST_X, CONST_B) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, FOUR) |
                     HWCONST(C8B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, ONE));

    hopper_memset_common(push, dst, size, 4);
}

bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push,
                                       uvm_gpu_address_t dst,
                                       size_t num_elements,
                                       size_t element_size)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // In HCC, a memset that uses physical addressing for the destination must
    // write to (protected) vidmem. A memset that uses virtual addressing with
    // backing storage other than vidmem is only legal when the destination is
    // unprotected sysmem and the copy type is NONPROT2NONPROT; this validation
    // does not detect that case.
    if (uvm_conf_computing_mode_is_hcc(gpu) && !dst.is_virtual && dst.aperture != UVM_APERTURE_VID)
        return false;

    if (!gpu->parent->ce_phys_vidmem_write_supported) {
        size_t size = num_elements * element_size;
        uvm_gpu_address_t temp = dst;

        // Physical vidmem writes are disallowed, unless using the scrubber.
        if (!dst.is_virtual && dst.aperture == UVM_APERTURE_VID && !hopper_scrub_enable(gpu, &temp, size)) {
            UVM_ERR_PRINT("Destination address of vidmem memset must be virtual, not physical: {%s, 0x%llx} size %zu\n",
                          uvm_gpu_address_aperture_string(dst),
                          dst.address,
                          size);
            return false;
        }
    }

    return true;
}

bool uvm_hal_hopper_ce_memcopy_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    if (uvm_conf_computing_mode_is_hcc(gpu)) {
        // In HCC, if a memcopy uses physical addressing for either the
        // destination or the source, then the corresponding aperture must be
        // vidmem. If virtual addressing is used and the backing storage is
        // sysmem, the access is only legal when the copy type is
        // NONPROT2NONPROT, i.e. both the source and the destination are
        // unprotected sysmem; this validation does not detect that case.
        if (!src.is_virtual && (src.aperture != UVM_APERTURE_VID))
            return false;

        if (!dst.is_virtual && (dst.aperture != UVM_APERTURE_VID))
            return false;

        if (dst.is_unprotected != src.is_unprotected)
            return false;
    }

    if (!gpu->parent->ce_phys_vidmem_write_supported && !dst.is_virtual && dst.aperture == UVM_APERTURE_VID) {
        UVM_ERR_PRINT("Destination address of vidmem memcopy must be virtual, not physical: {%s, 0x%llx}\n",
                      uvm_gpu_address_aperture_string(dst),
                      dst.address);
        return false;
    }

    return true;
}
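
// Illustrative examples of the HCC rules above: a copy from physical
// {vid, 0x0} to a virtual protected vidmem destination passes these checks,
// whereas a copy whose source or destination is physical sysmem is rejected.
// A copy between two virtual addresses backed by unprotected sysmem is legal
// only with COPY_TYPE NONPROT2NONPROT, which
// uvm_hal_hopper_ce_memcopy_copy_type() selects when both addresses are
// flagged is_unprotected; as noted above, that case is not caught here.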

// Specialized version of uvm_hal_volta_ce_memcopy used for encryption and
// decryption. Pre-Hopper functionality, such as validation or address
// patching, has been removed.
static void encrypt_or_decrypt(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, NvU32 size)
{
    NvU32 pipelined_value;
    NvU32 launch_dma_src_dst_type;
    NvU32 launch_dma_plc_mode;
    NvU32 flush_value;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // HW allows unaligned operations only if the entire buffer is in one 32B
    // sector. Operations on buffers larger than 32B have to be aligned.
    if (size > UVM_CONF_COMPUTING_BUF_ALIGNMENT) {
        UVM_ASSERT(IS_ALIGNED(src.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
        UVM_ASSERT(IS_ALIGNED(dst.address, UVM_CONF_COMPUTING_BUF_ALIGNMENT));
    }
    else {
        UVM_ASSERT((dst.address >> UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
                   ((dst.address + size - 1) >> UVM_CONF_COMPUTING_BUF_ALIGNMENT));
        UVM_ASSERT((src.address >> UVM_CONF_COMPUTING_BUF_ALIGNMENT) ==
                   ((src.address + size - 1) >> UVM_CONF_COMPUTING_BUF_ALIGNMENT));
    }

    launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    flush_value = hopper_get_flush_value(push);

    gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);

    NV_PUSH_1U(C8B5, LINE_LENGTH_IN, size);

    NV_PUSH_1U(C8B5, LAUNCH_DMA, HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
                                 HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
                                 HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
                                 HWCONST(C8B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) |
                                 HWCONST(C8B5, LAUNCH_DMA, COPY_TYPE, SECURE) |
                                 flush_value |
                                 launch_dma_src_dst_type |
                                 launch_dma_plc_mode |
                                 pipelined_value);
}

// The GPU CE encrypt operation requires clients to pass a valid address where
// the used IV will be written. But this requirement is unnecessary, because
// UVM should instead rely on the CSL nvUvmInterfaceCslLogDeviceEncryption API
// to independently track the expected IV.
//
// To satisfy the HW requirement the same unprotected sysmem address is passed
// to all GPU-side encryptions. This dummy buffer is allocated at GPU
// initialization time.
static NvU64 encrypt_iv_address(uvm_push_t *push, uvm_gpu_address_t dst)
{
    NvU64 iv_address;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    // Match addressing mode of destination and IV
    if (dst.is_virtual) {
        iv_address = uvm_rm_mem_get_gpu_va(gpu->conf_computing.iv_rm_mem, gpu, false).address;
    }
    else {
        iv_address = uvm_mem_gpu_physical(gpu->conf_computing.iv_mem,
                                          gpu,
                                          0,
                                          gpu->conf_computing.iv_mem->size).address;
    }

    UVM_ASSERT(IS_ALIGNED(iv_address, UVM_CONF_COMPUTING_IV_ALIGNMENT));

    return iv_address;
}

// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
void uvm_hal_hopper_ce_encrypt(uvm_push_t *push,
                               uvm_gpu_address_t dst,
                               uvm_gpu_address_t src,
                               NvU32 size,
                               uvm_gpu_address_t auth_tag)
{
    NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
    NvU64 iv_address;
    NvU32 iv_address_hi32, iv_address_lo32;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
    UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));

    if (!src.is_virtual)
        UVM_ASSERT(src.aperture == UVM_APERTURE_VID);

    // The addressing mode (and aperture, if applicable) of the destination
    // pointer determines the addressing mode and aperture used by the
    // encryption to reference the other two addresses written by it: the
    // authentication tag and the IV. If the client passes a sysmem physical
    // address as destination, then the authentication tag must also be a
    // sysmem physical address.
    UVM_ASSERT(dst.is_virtual == auth_tag.is_virtual);

    if (!dst.is_virtual) {
        UVM_ASSERT(dst.aperture == UVM_APERTURE_SYS);
        UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
    }

    NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, ENCRYPT));

    auth_tag_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
    auth_tag_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));

    iv_address = encrypt_iv_address(push, dst);

    iv_address_hi32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_UPPER, UPPER, NvU64_HI32(iv_address));
    iv_address_lo32 = HWVALUE(C8B5, SET_ENCRYPT_IV_ADDR_LOWER, LOWER, NvU64_LO32(iv_address));

    NV_PUSH_4U(C8B5, SET_ENCRYPT_AUTH_TAG_ADDR_UPPER, auth_tag_address_hi32,
                     SET_ENCRYPT_AUTH_TAG_ADDR_LOWER, auth_tag_address_lo32,
                     SET_ENCRYPT_IV_ADDR_UPPER, iv_address_hi32,
                     SET_ENCRYPT_IV_ADDR_LOWER, iv_address_lo32);

    encrypt_or_decrypt(push, dst, src, size);
}

// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers
void uvm_hal_hopper_ce_decrypt(uvm_push_t *push,
                               uvm_gpu_address_t dst,
                               uvm_gpu_address_t src,
                               NvU32 size,
                               uvm_gpu_address_t auth_tag)
{
    NvU32 auth_tag_address_hi32, auth_tag_address_lo32;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    UVM_ASSERT(uvm_conf_computing_mode_is_hcc(gpu));
    UVM_ASSERT(IS_ALIGNED(auth_tag.address, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));

    // The addressing mode (and aperture, if applicable) of the source and
    // authentication pointers should match. But unlike in the encryption case,
    // clients are not forced to pass a valid IV address.
    UVM_ASSERT(src.is_virtual == auth_tag.is_virtual);

    if (!src.is_virtual) {
        UVM_ASSERT(src.aperture == UVM_APERTURE_SYS);
        UVM_ASSERT(auth_tag.aperture == UVM_APERTURE_SYS);
    }

    if (!dst.is_virtual)
        UVM_ASSERT(dst.aperture == UVM_APERTURE_VID);

    NV_PUSH_1U(C8B5, SET_SECURE_COPY_MODE, HWCONST(C8B5, SET_SECURE_COPY_MODE, MODE, DECRYPT));

    auth_tag_address_hi32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, UPPER, NvU64_HI32(auth_tag.address));
    auth_tag_address_lo32 = HWVALUE(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, LOWER, NvU64_LO32(auth_tag.address));

    NV_PUSH_2U(C8B5, SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_UPPER, auth_tag_address_hi32,
                     SET_DECRYPT_AUTH_TAG_COMPARE_ADDR_LOWER, auth_tag_address_lo32);

    encrypt_or_decrypt(push, dst, src, size);
}