/*******************************************************************************
    Copyright (c) 2021-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_hal.h"
#include "uvm_push.h"
#include "clb06f.h"
#include "clb0b5.h"

void uvm_hal_maxwell_ce_init(uvm_push_t *push)
{
    // Notably this sends SET_OBJECT with the CE class on subchannel 0 instead
    // of the HW-recommended subchannel 4 (subchannel 4 is required to match
    // CE usage on GRCE). For the UVM driver, using subchannel 0 has the added
    // benefit of verifying that we ended up on the right CE engine type, as
    // SET_OBJECT with the CE class on subchannel 0 would fail on GRCE.
    NV_PUSH_1U(B06F, SET_OBJECT, uvm_push_get_gpu(push)->parent->rm_info.ceClass);
}

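// Set the destination address (OFFSET_OUT) for the next LAUNCH_DMA.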
void uvm_hal_maxwell_ce_offset_out(uvm_push_t *push, NvU64 offset_out)
{
    NV_PUSH_2U(B0B5, OFFSET_OUT_UPPER, HWVALUE(B0B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

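// Set both the source (OFFSET_IN) and destination (OFFSET_OUT) addresses for
// the next LAUNCH_DMA.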
void uvm_hal_maxwell_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 offset_out)
{
    NV_PUSH_4U(B0B5, OFFSET_IN_UPPER,  HWVALUE(B0B5, OFFSET_IN_UPPER,  UPPER, NvOffset_HI32(offset_in)),
                     OFFSET_IN_LOWER,  HWVALUE(B0B5, OFFSET_IN_LOWER,  VALUE, NvOffset_LO32(offset_in)),
                     OFFSET_OUT_UPPER, HWVALUE(B0B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

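// Release a one-word semaphore: write the 32-bit payload to gpu_va, preceded
// by a flush when uvm_hal_membar_before_semaphore() requires one. A minimal
// usage sketch (assuming 'push' targets a CE channel and 'sema_gpu_va' is a
// mapped, 4-byte aligned location):
//
//     gpu->parent->ce_hal->semaphore_release(push, sema_gpu_va, payload);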
void uvm_hal_maxwell_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    NvU32 flush_value;
    bool use_flush;

    use_flush = uvm_hal_membar_before_semaphore(push);

    if (use_flush)
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
    else
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);

    NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value |
                     HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                     HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE));
}

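// Increment the 32-bit semaphore at gpu_va using a CE semaphore reduction
// (unsigned INC), with the same flush behavior as semaphore_release() above.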
void uvm_hal_maxwell_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    NvU32 flush_value;
    bool use_flush;

    use_flush = uvm_hal_membar_before_semaphore(push);

    if (use_flush)
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
    else
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);

    NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value |
                     HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                     HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
                     HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
                     HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) |
                     HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE));
}

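// Release a four-word semaphore, which writes both a payload and a GPU
// timestamp to gpu_va. Only the timestamp is of interest to callers, so the
// payload is set to a fixed 0xdeadbeef marker.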
void uvm_hal_maxwell_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
{
    NvU32 flush_value;
    bool use_flush;

    use_flush = uvm_hal_membar_before_semaphore(push);

    if (use_flush)
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
    else
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);

    NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, 0xdeadbeef);

    NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value |
                     HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
                     HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_FOUR_WORD_SEMAPHORE));
}

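// Issue the membar ordering the preceding transfer, as selected by the push's
// UVM_PUSH_FLAG_NEXT_MEMBAR_* flags.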
static void maxwell_membar_after_transfer(uvm_push_t *push)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
        return;

    // Flush on transfers only works when paired with a semaphore release, so
    // use a host WFI + MEMBAR instead.
    // Bug 1709888
    gpu->parent->host_hal->wait_for_idle(push);

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
        gpu->parent->host_hal->membar_gpu(push);
    else
        gpu->parent->host_hal->membar_sys(push);
}

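// Translate a UVM aperture to the CE's SET_{SRC,DST}_PHYS_MODE TARGET field.
// The BUILD_BUG_ONs below guarantee that the SRC and DST encodings match, so
// the same value can be used with either method.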
static NvU32 ce_aperture(uvm_aperture_t aperture)
{
    BUILD_BUG_ON(HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB) !=
                 HWCONST(B0B5, SET_DST_PHYS_MODE, TARGET, LOCAL_FB));
    BUILD_BUG_ON(HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM) !=
                 HWCONST(B0B5, SET_DST_PHYS_MODE, TARGET, COHERENT_SYSMEM));

    UVM_ASSERT_MSG(aperture == UVM_APERTURE_VID || aperture == UVM_APERTURE_SYS, "aperture 0x%x\n", aperture);

    if (aperture == UVM_APERTURE_SYS)
        return HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM);
    else
        return HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB);
}

// Push SET_{SRC,DST}_PHYS_MODE if needed and return the LAUNCH_DMA
// {SRC,DST}_TYPE flags.
NvU32 uvm_hal_maxwell_ce_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    NvU32 launch_dma_src_dst_type = 0;

    if (src.is_virtual)
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, SRC_TYPE, VIRTUAL);
    else
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, SRC_TYPE, PHYSICAL);

    if (dst.is_virtual)
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);
    else
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);

    if (!src.is_virtual && !dst.is_virtual) {
        NV_PUSH_2U(B0B5, SET_SRC_PHYS_MODE, ce_aperture(src.aperture),
                         SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    }
    else if (!src.is_virtual) {
        NV_PUSH_1U(B0B5, SET_SRC_PHYS_MODE, ce_aperture(src.aperture));
    }
    else if (!dst.is_virtual) {
        NV_PUSH_1U(B0B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    }

    return launch_dma_src_dst_type;
}

// Noop, since DISABLE_PLC doesn't exist on Maxwell.
NvU32 uvm_hal_maxwell_ce_plc_mode(void)
{
    return 0;
}

// Noop, since COPY_TYPE doesn't exist on Maxwell.
NvU32 uvm_hal_maxwell_ce_memcopy_copy_type(uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    return 0;
}

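// Copy size bytes from src to dst, splitting the transfer into chunks of at
// most 4GB - 1 bytes. A minimal usage sketch (assuming a CE push has already
// been begun and both virtual addresses are mapped in the channel's address
// space, mirroring memcopy_v_to_v() below):
//
//     gpu->parent->ce_hal->memcopy(push,
//                                  uvm_gpu_address_virtual(dst_va),
//                                  uvm_gpu_address_virtual(src_va),
//                                  size);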
void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size)
{
    // If >4GB copies ever become an important use case, this function should
    // use multi-line transfers so we don't have to iterate (bug 1766588).
    static const size_t max_single_copy_size = 0xFFFFFFFF;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    NvU32 pipelined_value;
    NvU32 launch_dma_src_dst_type;
    NvU32 launch_dma_plc_mode;
    NvU32 copy_type_value;

    UVM_ASSERT_MSG(gpu->parent->ce_hal->memcopy_is_valid(push, dst, src),
                   "Memcopy validation failed in channel %s, GPU %s.\n",
                   push->channel->name,
                   uvm_gpu_name(gpu));

    gpu->parent->ce_hal->memcopy_patch_src(push, &src);

    launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
    copy_type_value = gpu->parent->ce_hal->memcopy_copy_type(dst, src);

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    do {
        NvU32 copy_this_time = (NvU32)min(size, max_single_copy_size);

        gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);

        NV_PUSH_1U(B0B5, LINE_LENGTH_IN, copy_this_time);

        NV_PUSH_1U(B0B5, LAUNCH_DMA,
                   HWCONST(B0B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
                   HWCONST(B0B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
                   HWCONST(B0B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
                   HWCONST(B0B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) |
                   HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) |
                   launch_dma_src_dst_type |
                   launch_dma_plc_mode |
                   copy_type_value |
                   pipelined_value);

        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
        dst.address += copy_this_time;
        src.address += copy_this_time;
        size -= copy_this_time;
    } while (size > 0);

    maxwell_membar_after_transfer(push);
}

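// Convenience wrapper for virtual-to-virtual copies.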
void uvm_hal_maxwell_ce_memcopy_v_to_v(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, size_t size)
{
    uvm_push_get_gpu(push)->parent->ce_hal->memcopy(push,
                                                    uvm_gpu_address_virtual(dst_va),
                                                    uvm_gpu_address_virtual(src_va),
                                                    size);
}

// Push SET_DST_PHYS_MODE if needed and return the LAUNCH_DMA DST_TYPE flag.
static NvU32 maxwell_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst)
{
    if (dst.is_virtual)
        return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);

    NV_PUSH_1U(B0B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);
}

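// Common memset path. 'size' is the element count and memset_element_size the
// element width in bytes; the caller must have already programmed the remap
// constants and components.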
static void memset_common(uvm_push_t *push, uvm_gpu_address_t dst, size_t size, size_t memset_element_size)
{
    // If >4GB memsets ever become an important use case, this function should
    // use multi-line transfers so we don't have to iterate (bug 1766588).
    static const size_t max_single_memset_size = 0xFFFFFFFF;

    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 pipelined_value;
    NvU32 launch_dma_dst_type;
    NvU32 launch_dma_plc_mode;

    UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, size, memset_element_size),
                   "Memset validation failed in channel %s, GPU %s.\n",
                   push->channel->name,
                   uvm_gpu_name(gpu));

    launch_dma_dst_type = maxwell_memset_push_phys_mode(push, dst);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    do {
        NvU32 memset_this_time = (NvU32)min(size, max_single_memset_size);

        gpu->parent->ce_hal->offset_out(push, dst.address);

        NV_PUSH_1U(B0B5, LINE_LENGTH_IN, memset_this_time);

        NV_PUSH_1U(B0B5, LAUNCH_DMA,
                   HWCONST(B0B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
                   HWCONST(B0B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
                   HWCONST(B0B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
                   HWCONST(B0B5, LAUNCH_DMA, REMAP_ENABLE, TRUE) |
                   HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) |
                   launch_dma_dst_type |
                   launch_dma_plc_mode |
                   pipelined_value);

        dst.address += memset_this_time * memset_element_size;
        size -= memset_this_time;
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    } while (size > 0);

    maxwell_membar_after_transfer(push);
}

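// Set size bytes at dst to an 8-bit value, replicated by the CE remap unit.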
void uvm_hal_maxwell_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size)
{
    NV_PUSH_2U(B0B5, SET_REMAP_CONST_B, (NvU32)value,
                     SET_REMAP_COMPONENTS,
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X, CONST_B) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, ONE) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, ONE));

    memset_common(push, dst, size, 1);
}

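// Set size bytes at dst to a 32-bit value; size must be a multiple of 4. A
// minimal usage sketch (assuming dst_va is mapped in the channel's address
// space, mirroring memset_v_4() below):
//
//     gpu->parent->ce_hal->memset_4(push, uvm_gpu_address_virtual(dst_va), value, size);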
void uvm_hal_maxwell_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size)
{
    UVM_ASSERT_MSG(size % 4 == 0, "size: %zu\n", size);

    size /= 4;

    NV_PUSH_2U(B0B5, SET_REMAP_CONST_B, value,
                     SET_REMAP_COMPONENTS,
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X, CONST_B) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, FOUR) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, ONE));

    memset_common(push, dst, size, 4);
}

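// Set size bytes at dst to a 64-bit value; size must be a multiple of 8. The
// value is split across two 4-byte remap components: CONST_A holds the low
// word and CONST_B the high word.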
void uvm_hal_maxwell_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size)
{
    UVM_ASSERT_MSG(size % 8 == 0, "size: %zu\n", size);

    size /= 8;

    NV_PUSH_3U(B0B5, SET_REMAP_CONST_A, (NvU32)value,
                     SET_REMAP_CONST_B, (NvU32)(value >> 32),
                     SET_REMAP_COMPONENTS,
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X, CONST_A) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_Y, CONST_B) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE, FOUR) |
                     HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS, TWO));

    memset_common(push, dst, size, 8);
}

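// Convenience wrapper: 32-bit memset to a virtual address.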
void uvm_hal_maxwell_ce_memset_v_4(uvm_push_t *push, NvU64 dst_va, NvU32 value, size_t size)
{
    uvm_push_get_gpu(push)->parent->ce_hal->memset_4(push, uvm_gpu_address_virtual(dst_va), value, size);
}

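// CE encryption and decryption are used by Confidential Computing on newer
// architectures; they are not supported here, so these entry points only
// assert.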
void uvm_hal_maxwell_ce_encrypt_unsupported(uvm_push_t *push,
                                            uvm_gpu_address_t dst,
                                            uvm_gpu_address_t src,
                                            NvU32 size,
                                            uvm_gpu_address_t auth_tag)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    UVM_ASSERT_MSG(false, "CE encrypt is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
}

void uvm_hal_maxwell_ce_decrypt_unsupported(uvm_push_t *push,
                                            uvm_gpu_address_t dst,
                                            uvm_gpu_address_t src,
                                            NvU32 size,
                                            uvm_gpu_address_t auth_tag)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    UVM_ASSERT_MSG(false, "CE decrypt is not supported on GPU: %s.\n", uvm_gpu_name(gpu));
}