/*******************************************************************************
    Copyright (c) 2021-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_hal.h"
#include "uvm_push.h"
#include "clb06f.h"
#include "clb0b5.h"
void uvm_hal_maxwell_ce_init(uvm_push_t *push)
{
    // Notably, this sends SET_OBJECT with the CE class on subchannel 0 instead
    // of the HW-recommended subchannel 4 (subchannel 4 is recommended to match
    // CE usage on GRCE). For the UVM driver, using subchannel 0 has the added
    // benefit of verifying that we ended up on the right PBDMA, as SET_OBJECT
    // with the CE class on subchannel 0 would fail on GRCE.
    NV_PUSH_1U(B06F, SET_OBJECT, uvm_push_get_gpu(push)->parent->rm_info.ceClass);
}

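// Set the destination offset for subsequent CE transfers by splitting the
// 64-bit address across the OFFSET_OUT_UPPER/LOWER method pair.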
void uvm_hal_maxwell_ce_offset_out(uvm_push_t *push, NvU64 offset_out)
{
    NV_PUSH_2U(B0B5, OFFSET_OUT_UPPER, HWVALUE(B0B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

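// Set both the source and destination offsets for subsequent CE transfers.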
void uvm_hal_maxwell_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 offset_out)
{
    NV_PUSH_4U(B0B5, OFFSET_IN_UPPER,  HWVALUE(B0B5, OFFSET_IN_UPPER,  UPPER, NvOffset_HI32(offset_in)),
                     OFFSET_IN_LOWER,  HWVALUE(B0B5, OFFSET_IN_LOWER,  VALUE, NvOffset_LO32(offset_in)),
                     OFFSET_OUT_UPPER, HWVALUE(B0B5, OFFSET_OUT_UPPER, UPPER, NvOffset_HI32(offset_out)),
                     OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}

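// Release a one-word (32-bit) semaphore at gpu_va with the given payload.
// uvm_hal_membar_before_semaphore() decides, based on the push's membar
// flags, whether the release needs to be preceded by a flush.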
void uvm_hal_maxwell_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    NvU32 flush_value;
    bool use_flush;

    use_flush = uvm_hal_membar_before_semaphore(push);

    if (use_flush)
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
    else
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);

    NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value |
       HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
       HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE));
}

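// Release the one-word semaphore at gpu_va by applying an unsigned INC
// reduction to its current value, rather than writing the payload directly.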
void uvm_hal_maxwell_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
    NvU32 flush_value;
    bool use_flush;

    use_flush = uvm_hal_membar_before_semaphore(push);

    if (use_flush)
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
    else
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);

    NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, payload);

    NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value |
       HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
       HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
       HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
       HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_SIGN, UNSIGNED) |
       HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_REDUCTION_ENABLE, TRUE));
}

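// Release a four-word semaphore at gpu_va, which writes a GPU timestamp in
// addition to the payload. Only the timestamp is of interest here, so a dummy
// payload is used.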
void uvm_hal_maxwell_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
{
    NvU32 flush_value;
    bool use_flush;

    use_flush = uvm_hal_membar_before_semaphore(push);

    if (use_flush)
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
    else
        flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);

    NV_PUSH_3U(B0B5, SET_SEMAPHORE_A, HWVALUE(B0B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
                     SET_SEMAPHORE_B, HWVALUE(B0B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
                     SET_SEMAPHORE_PAYLOAD, 0xdeadbeef);

    NV_PUSH_1U(B0B5, LAUNCH_DMA, flush_value |
       HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
       HWCONST(B0B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_FOUR_WORD_SEMAPHORE));
}

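// Issue the membar required to order a preceding CE transfer, honoring the
// push's NEXT_MEMBAR flags.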
static void maxwell_membar_after_transfer(uvm_push_t *push)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
        return;

    // Flush on transfers only works when paired with a semaphore release. Use
    // a host WFI + MEMBAR instead (Bug 1709888).
    gpu->parent->host_hal->wait_for_idle(push);

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
        gpu->parent->host_hal->membar_gpu(push);
    else
        gpu->parent->host_hal->membar_sys(push);
}

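// Translate a UVM aperture to the matching CE SET_{SRC,DST}_PHYS_MODE target.
// The BUILD_BUG_ONs guarantee that the SRC and DST encodings are identical,
// so the returned value can be used with either method.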
static NvU32 ce_aperture(uvm_aperture_t aperture)
{
    BUILD_BUG_ON(HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB) !=
                 HWCONST(B0B5, SET_DST_PHYS_MODE, TARGET, LOCAL_FB));
    BUILD_BUG_ON(HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM) !=
                 HWCONST(B0B5, SET_DST_PHYS_MODE, TARGET, COHERENT_SYSMEM));

    UVM_ASSERT_MSG(aperture == UVM_APERTURE_VID || aperture == UVM_APERTURE_SYS, "aperture 0x%x\n", aperture);

    if (aperture == UVM_APERTURE_SYS)
        return HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, COHERENT_SYSMEM);
    else
        return HWCONST(B0B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB);
}

// Push SET_{SRC,DST}_PHYS_MODE if needed and return the LAUNCH_DMA
// {SRC,DST}_TYPE flags.
NvU32 uvm_hal_maxwell_ce_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
    NvU32 launch_dma_src_dst_type = 0;

    if (src.is_virtual)
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, SRC_TYPE, VIRTUAL);
    else
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, SRC_TYPE, PHYSICAL);

    if (dst.is_virtual)
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);
    else
        launch_dma_src_dst_type |= HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);

    if (!src.is_virtual && !dst.is_virtual) {
        NV_PUSH_2U(B0B5, SET_SRC_PHYS_MODE, ce_aperture(src.aperture),
                         SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    }
    else if (!src.is_virtual) {
        NV_PUSH_1U(B0B5, SET_SRC_PHYS_MODE, ce_aperture(src.aperture));
    }
    else if (!dst.is_virtual) {
        NV_PUSH_1U(B0B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    }

    return launch_dma_src_dst_type;
}

// No-op, since DISABLE_PLC doesn't exist in Maxwell.
NvU32 uvm_hal_maxwell_ce_plc_mode(void)
{
    return 0;
}

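// Copy size bytes from src to dst, splitting the copy into chunks of at most
// 4GB - 1 bytes each. Only the first chunk honors the NEXT_PIPELINED flag;
// subsequent chunks are always pipelined. The membar required by the push's
// flags is issued once, after the last chunk.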
void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size)
{
    // If >4GB copies ever become an important use case, this function should
    // use multi-line transfers so we don't have to iterate (bug 1766588).
    static const size_t max_single_copy_size = 0xFFFFFFFF;
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);

    NvU32 pipelined_value;
    NvU32 launch_dma_src_dst_type;
    NvU32 launch_dma_plc_mode;

    UVM_ASSERT_MSG(gpu->parent->ce_hal->memcopy_is_valid(push, dst, src),
                   "Memcopy validation failed in channel %s, GPU %s.\n",
                   push->channel->name,
                   uvm_gpu_name(gpu));

    gpu->parent->ce_hal->memcopy_patch_src(push, &src);

    launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    do {
        NvU32 copy_this_time = (NvU32)min(size, max_single_copy_size);

        gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);

        NV_PUSH_1U(B0B5, LINE_LENGTH_IN, copy_this_time);

        NV_PUSH_1U(B0B5, LAUNCH_DMA,
           HWCONST(B0B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
           HWCONST(B0B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
           HWCONST(B0B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
           HWCONST(B0B5, LAUNCH_DMA, REMAP_ENABLE, FALSE) |
           HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) |
           launch_dma_src_dst_type |
           launch_dma_plc_mode |
           pipelined_value);

        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
        dst.address += copy_this_time;
        src.address += copy_this_time;
        size -= copy_this_time;
    } while (size > 0);

    maxwell_membar_after_transfer(push);
}

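// Convenience wrapper for virtual-to-virtual copies.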
void uvm_hal_maxwell_ce_memcopy_v_to_v(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, size_t size)
{
    uvm_push_get_gpu(push)->parent->ce_hal->memcopy(push,
                                                    uvm_gpu_address_virtual(dst_va),
                                                    uvm_gpu_address_virtual(src_va),
                                                    size);
}

// Push SET_DST_PHYS_MODE if needed and return the LAUNCH_DMA DST_TYPE flags.
static NvU32 maxwell_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst)
{
    if (dst.is_virtual)
        return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);

    NV_PUSH_1U(B0B5, SET_DST_PHYS_MODE, ce_aperture(dst.aperture));
    return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, PHYSICAL);
}

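// Common memset body. Note that size is the number of elements to write, each
// memset_element_size bytes wide; callers convert from bytes. As with copies,
// the memset is split into chunks of at most 0xFFFFFFFF elements, and the
// membar required by the push's flags is issued after the last one.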
static void memset_common(uvm_push_t *push, uvm_gpu_address_t dst, size_t size, size_t memset_element_size)
{
    // If >4GB memsets ever become an important use case, this function should
    // use multi-line transfers so we don't have to iterate (bug 1766588).
    static const size_t max_single_memset_size = 0xFFFFFFFF;

    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    NvU32 pipelined_value;
    NvU32 launch_dma_dst_type;
    NvU32 launch_dma_plc_mode;

    UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size),
                   "Memset validation failed in channel %s, GPU %s.\n",
                   push->channel->name,
                   uvm_gpu_name(gpu));

    launch_dma_dst_type = maxwell_memset_push_phys_mode(push, dst);
    launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();

    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    else
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);

    do {
        NvU32 memset_this_time = (NvU32)min(size, max_single_memset_size);

        gpu->parent->ce_hal->offset_out(push, dst.address);

        NV_PUSH_1U(B0B5, LINE_LENGTH_IN, memset_this_time);

        NV_PUSH_1U(B0B5, LAUNCH_DMA,
           HWCONST(B0B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
           HWCONST(B0B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
           HWCONST(B0B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
           HWCONST(B0B5, LAUNCH_DMA, REMAP_ENABLE, TRUE) |
           HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) |
           launch_dma_dst_type |
           launch_dma_plc_mode |
           pipelined_value);

        dst.address += memset_this_time * memset_element_size;
        size -= memset_this_time;
        pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
    } while (size > 0);

    maxwell_membar_after_transfer(push);
}

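// Memset with a 1-byte pattern, using the CE remap feature to replicate
// CONST_B across the destination.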
void uvm_hal_maxwell_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size)
{
    NV_PUSH_2U(B0B5, SET_REMAP_CONST_B,    (NvU32)value,
                     SET_REMAP_COMPONENTS,
       HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X,               CONST_B) |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE,      ONE)     |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS,  ONE));

    memset_common(push, dst, size, 1);
}

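// Memset with a 4-byte pattern, written as a single four-byte CONST_B
// component per element.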
void uvm_hal_maxwell_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size)
{
    UVM_ASSERT_MSG(size % 4 == 0, "size: %zu\n", size);

    size /= 4;

    NV_PUSH_2U(B0B5, SET_REMAP_CONST_B,    value,
                     SET_REMAP_COMPONENTS,
       HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X,               CONST_B) |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE,      FOUR)    |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS,  ONE));

    memset_common(push, dst, size, 4);
}

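// Memset with an 8-byte pattern. The 64-bit value is written as two four-byte
// components per element: CONST_A holds the low word and CONST_B the high
// word.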
void uvm_hal_maxwell_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size)
{
    UVM_ASSERT_MSG(size % 8 == 0, "size: %zu\n", size);

    size /= 8;

    NV_PUSH_3U(B0B5, SET_REMAP_CONST_A, (NvU32)value,
                     SET_REMAP_CONST_B, (NvU32)(value >> 32),
                     SET_REMAP_COMPONENTS,
       HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_X,               CONST_A) |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, DST_Y,               CONST_B) |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, COMPONENT_SIZE,      FOUR)    |
       HWCONST(B0B5, SET_REMAP_COMPONENTS, NUM_DST_COMPONENTS,  TWO));

    memset_common(push, dst, size, 8);
}

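// Convenience wrapper for 4-byte memsets to virtual addresses.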
void uvm_hal_maxwell_ce_memset_v_4(uvm_push_t *push, NvU64 dst_va, NvU32 value, size_t size)
{
    uvm_push_get_gpu(push)->parent->ce_hal->memset_4(push, uvm_gpu_address_virtual(dst_va), value, size);
}