1 /******************************************************************************* 2 Copyright (c) 2021-2022 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "uvm_hal.h" 25 #include "uvm_push.h" 26 #include "clb06f.h" 27 #include "clb0b5.h" 28 29 void uvm_hal_maxwell_ce_init(uvm_push_t *push) 30 { 31 // Notably this sends SET_OBJECT with the CE class on subchannel 0 instead 32 // of the recommended by HW subchannel 4 (subchannel 4 is recommended to 33 // match CE usage on GRCE). For the UVM driver using subchannel 0 has the 34 // benefit of also verifying that we ended up on the right PBDMA though as 35 // SET_OBJECT with CE class on subchannel 0 would fail on GRCE. 36 NV_PUSH_1U(B06F, SET_OBJECT, uvm_push_get_gpu(push)->parent->rm_info.ceClass); 37 } 38 39 void uvm_hal_maxwell_ce_offset_out(uvm_push_t *push, NvU64 offset_out) 40 { 41 NV_PUSH_2U(B0B5, OFFSET_OUT_UPPER, HWVALUE(B0B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)), 42 OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out))); 43 } 44 45 void uvm_hal_maxwell_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 offset_out) 46 { 47 NV_PUSH_4U(B0B5, OFFSET_IN_UPPER, HWVALUE(B0B5, OFFSET_IN_UPPER, UPPER, NvOffset_HI32(offset_in)), 48 OFFSET_IN_LOWER, HWVALUE(B0B5, OFFSET_IN_LOWER, VALUE, NvOffset_LO32(offset_in)), 49 OFFSET_OUT_UPPER, HWVALUE(B0B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)), 50 OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out))); 51 } 52 53 void uvm_hal_maxwell_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload) 54 { 55 NvU32 flush_value; 56 bool use_flush; 57 58 use_flush = uvm_hal_membar_before_semaphore(push); 59 60 if (use_flush) 61 flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE); 62 else 63 flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE); 64 65 NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)), 66 SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)), 67 SET_SEMAPHORE_PAYLOAD, payload); 68 69 NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value | 70 HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) | 71 HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE)); 72 } 73 74 void uvm_hal_maxwell_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload) 75 { 76 NvU32 flush_value; 77 bool use_flush; 78 79 use_flush = uvm_hal_membar_before_semaphore(push); 80 81 if (use_flush) 82 flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE); 83 else 84 flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE); 85 86 NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)), 87 SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)), 88 SET_SEMAPHORE_PAYLOAD, payload); 89 90 NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value | 91 HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) | 92 HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) | 93 HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) | 94 HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) | 95 HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE)); 96 } 97 98 void uvm_hal_maxwell_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va) 99 { 100 NvU32 flush_value; 101 bool use_flush; 102 103 use_flush = uvm_hal_membar_before_semaphore(push); 104 105 if (use_flush) 106 flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE); 107 else 108 flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE); 109 110 NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)), 111 SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)), 112 SET_SEMAPHORE_PAYLOAD, 0xdeadbeef); 113 114 NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value | 115 HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) | 116 HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_FOUR_WORD_SEMAPHORE)); 117 } 118 119 static void maxwell_membar_after_transfer(uvm_push_t *push) 120 { 121 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 122 123 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) 124 return; 125 126 // Flush on transfers only works when paired with a semaphore release. Use a 127 // host WFI + MEMBAR. 128 // Bug 1709888 129 gpu->parent->host_hal->wait_for_idle(push); 130 131 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) 132 gpu->parent->host_hal->membar_gpu(push); 133 else 134 gpu->parent->host_hal->membar_sys(push); 135 } 136 137 static NvU32 ce_aperture(uvm_aperture_t aperture) 138 { 139 BUILD_BUG_ON(HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB) != 140 HWCONST(B0B5, SET_DST_PHYS_MODE, TARGET, LOCAL_FB)); 141 BUILD_BUG_ON(HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM) != 142 HWCONST(B0B5, SET_DST_PHYS_MODE, TARGET, COHERENT_SYSMEM)); 143 144 UVM_ASSERT_MSG(aperture == UVM_APERTURE_VID || aperture == UVM_APERTURE_SYS, "aperture 0x%x\n", aperture); 145 146 if (aperture == UVM_APERTURE_SYS) 147 return HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM); 148 else 149 return HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB); 150 } 151 152 // Push SET_{SRC,DST}_PHYS mode if needed and return LAUNCH_DMA_{SRC,DST}_TYPE 153 // flags 154 NvU32 uvm_hal_maxwell_ce_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src) 155 { 156 NvU32 launch_dma_src_dst_type = 0; 157 158 if (src.is_virtual) 159 launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, SRC_TYPE, VIRTUAL); 160 else 161 launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, SRC_TYPE, PHYSICAL); 162 163 if (dst.is_virtual) 164 launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL); 165 else 166 launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, PHYSICAL); 167 168 if (!src.is_virtual && !dst.is_virtual) { 169 NV_PUSH_2U(B0B5, SET_SRC_PHYS_MODE, ce_aperture(src.aperture), 170 SET_DST_PHYS_MODE, ce_aperture(dst.aperture)); 171 } 172 else if (!src.is_virtual) { 173 NV_PUSH_1U(B0B5, SET_SRC_PHYS_MODE, ce_aperture(src.aperture)); 174 } 175 else if (!dst.is_virtual) { 176 NV_PUSH_1U(B0B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture)); 177 } 178 179 return launch_dma_src_dst_type; 180 } 181 182 // Noop, since DISABLE_PLC doesn't exist in Maxwell. 183 NvU32 uvm_hal_maxwell_ce_plc_mode(void) 184 { 185 return 0; 186 } 187 188 void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size) 189 { 190 // If >4GB copies ever become an important use case, this function should 191 // use multi-line transfers so we don't have to iterate (bug 1766588). 192 static const size_t max_single_copy_size = 0xFFFFFFFF; 193 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 194 195 NvU32 pipelined_value; 196 NvU32 launch_dma_src_dst_type; 197 NvU32 launch_dma_plc_mode; 198 199 UVM_ASSERT_MSG(gpu->parent->ce_hal->memcopy_is_valid(push, dst, src), 200 "Memcopy validation failed in channel %s, GPU %s.\n", 201 push->channel->name, 202 uvm_gpu_name(gpu)); 203 204 gpu->parent->ce_hal->memcopy_patch_src(push, &src); 205 206 launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src); 207 launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode(); 208 209 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED)) 210 pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED); 211 else 212 pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED); 213 214 do { 215 NvU32 copy_this_time = (NvU32)min(size, max_single_copy_size); 216 217 gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address); 218 219 NV_PUSH_1U(B0B5, LINE_LENGTH_IN, copy_this_time); 220 221 NV_PUSH_1U(B0B5, LAUNCH_DMA, 222 HWCONST(B0B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) | 223 HWCONST(B0B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) | 224 HWCONST(B0B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) | 225 HWCONST(B0B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) | 226 HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) | 227 launch_dma_src_dst_type | 228 launch_dma_plc_mode | 229 pipelined_value); 230 231 pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED); 232 dst.address += copy_this_time; 233 src.address += copy_this_time; 234 size -= copy_this_time; 235 } while (size > 0); 236 237 maxwell_membar_after_transfer(push); 238 } 239 240 void uvm_hal_maxwell_ce_memcopy_v_to_v(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, size_t size) 241 { 242 uvm_push_get_gpu(push)->parent->ce_hal->memcopy(push, 243 uvm_gpu_address_virtual(dst_va), 244 uvm_gpu_address_virtual(src_va), 245 size); 246 } 247 248 // Push SET_DST_PHYS mode if needed and return LAUNCH_DMA_DST_TYPE flags 249 static NvU32 maxwell_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst) 250 { 251 if (dst.is_virtual) 252 return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL); 253 254 NV_PUSH_1U(B0B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture)); 255 return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, PHYSICAL); 256 } 257 258 static void memset_common(uvm_push_t *push, uvm_gpu_address_t dst, size_t size, size_t memset_element_size) 259 { 260 // If >4GB memsets ever become an important use case, this function should 261 // use multi-line transfers so we don't have to iterate (bug 1766588). 262 static const size_t max_single_memset_size = 0xFFFFFFFF; 263 264 uvm_gpu_t *gpu = uvm_push_get_gpu(push); 265 NvU32 pipelined_value; 266 NvU32 launch_dma_dst_type; 267 NvU32 launch_dma_plc_mode; 268 269 UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size), 270 "Memset validation failed in channel %s, GPU %s.\n", 271 push->channel->name, 272 uvm_gpu_name(gpu)); 273 274 launch_dma_dst_type = maxwell_memset_push_phys_mode(push, dst); 275 launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode(); 276 277 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED)) 278 pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED); 279 else 280 pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED); 281 282 do { 283 NvU32 memset_this_time = (NvU32)min(size, max_single_memset_size); 284 285 gpu->parent->ce_hal->offset_out(push, dst.address); 286 287 NV_PUSH_1U(B0B5, LINE_LENGTH_IN, memset_this_time); 288 289 NV_PUSH_1U(B0B5, LAUNCH_DMA, 290 HWCONST(B0B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) | 291 HWCONST(B0B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) | 292 HWCONST(B0B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) | 293 HWCONST(B0B5, LAUNCH_DMA, REMAP_ENABLE, TRUE) | 294 HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) | 295 launch_dma_dst_type | 296 launch_dma_plc_mode | 297 pipelined_value); 298 299 dst.address += memset_this_time * memset_element_size; 300 size -= memset_this_time; 301 pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED); 302 } while (size > 0); 303 304 maxwell_membar_after_transfer(push); 305 } 306 307 void uvm_hal_maxwell_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size) 308 { 309 NV_PUSH_2U(B0B5, SET_REMAP_CONST_B, (NvU32)value, 310 SET_REMAP_COMPONENTS, 311 HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X, CONST_B) | 312 HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, ONE) | 313 HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, ONE)); 314 315 memset_common(push, dst, size, 1); 316 } 317 318 void uvm_hal_maxwell_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size) 319 { 320 UVM_ASSERT_MSG(size % 4 == 0, "size: %zd\n", size); 321 322 size /= 4; 323 324 NV_PUSH_2U(B0B5, SET_REMAP_CONST_B, value, 325 SET_REMAP_COMPONENTS, 326 HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X, CONST_B) | 327 HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, FOUR) | 328 HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, ONE)); 329 330 memset_common(push, dst, size, 4); 331 } 332 333 void uvm_hal_maxwell_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size) 334 { 335 UVM_ASSERT_MSG(size % 8 == 0, "size: %zd\n", size); 336 337 size /= 8; 338 339 NV_PUSH_3U(B0B5, SET_REMAP_CONST_A, (NvU32)value, 340 SET_REMAP_CONST_B, (NvU32)(value >> 32), 341 SET_REMAP_COMPONENTS, 342 HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X, CONST_A) | 343 HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_Y, CONST_B) | 344 HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, FOUR) | 345 HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, TWO)); 346 347 memset_common(push, dst, size, 8); 348 } 349 350 void uvm_hal_maxwell_ce_memset_v_4(uvm_push_t *push, NvU64 dst_va, NvU32 value, size_t size) 351 { 352 uvm_push_get_gpu(push)->parent->ce_hal->memset_4(push, uvm_gpu_address_virtual(dst_va), value, size); 353 } 354 355