/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */



#include "nvidia-push-init.h"
#include "nvidia-push-utils.h"
#include "nvidia-push-priv.h"
#include "nvidia-push-priv-imports.h"

#include "nvos.h"

#include "nv_assert.h"

#include "alloc/alloc_channel.h"
#include "class/cl0002.h" // NV01_CONTEXT_DMA
#include "class/cl003e.h" // NV01_MEMORY_SYSTEM
#include "class/cl0040.h" // NV01_MEMORY_LOCAL_USER

#include "class/cla16f.h" // KEPLER_CHANNEL_GPFIFO_B
#include "class/cla26f.h" // KEPLER_CHANNEL_GPFIFO_C
#include "class/clb06f.h" // MAXWELL_CHANNEL_GPFIFO_A
#include "class/clc06f.h" // PASCAL_CHANNEL_GPFIFO_A
#include "class/clc36f.h" // VOLTA_CHANNEL_GPFIFO_A
#include "class/clc46f.h" // TURING_CHANNEL_GPFIFO_A
#include "class/cl50a0.h" // NV50_MEMORY_VIRTUAL
#include "class/clc56f.h" // AMPERE_CHANNEL_GPFIFO_A
#include "class/clc86f.h" // HOPPER_CHANNEL_GPFIFO_A
#include "class/clc361.h" // VOLTA_USERMODE_A
#include "class/clc661.h" // HOPPER_USERMODE_A

#include "ctrl/ctrl0080/ctrl0080fifo.h" // NV0080_CTRL_CMD_FIFO_GET_CAPS_V2
#include "ctrl/ctrl2080/ctrl2080bus.h"  // NV2080_CTRL_CMD_BUS_GET_INFO
#include "ctrl/ctrla06f.h" // KEPLER_CHANNEL_GPFIFO_A
#include "ctrl/ctrlc36f.h" // VOLTA_CHANNEL_GPFIFO_A
static NvU32 GetHandle(
    const NvU32 *pHandlePool,
    NvU8 handlePoolSize,
    NvU64 *pUsedHandleBitmask)
{
    NvU8 i;
    const NvU64 usedHandleBitmask = *pUsedHandleBitmask;

    /*
     * We assume there are fewer than 64 handles in the pool.  If the
     * pool is larger than that, we'll need a fancier bitmask.
     */
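    /*
     * For illustration (hypothetical values, not taken from any caller): with
     * handlePoolSize == 3 and *pUsedHandleBitmask == 0x1, index 0 is skipped,
     * pHandlePool[1] is returned, and the bitmask becomes 0x3.
     */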
    nvAssert(handlePoolSize < (sizeof(NvU64) * 8));

    for (i = 0; i < handlePoolSize; i++) {
        if ((usedHandleBitmask & NVBIT64(i)) == 0) {
            *pUsedHandleBitmask |= NVBIT64(i);
            return pHandlePool[i];
        }
    }

    nvAssert(!"Exhausted handlePool!");

    return 0;
}

static NvU32 GetChannelHandle(
    const NvPushAllocChannelParams *pParams,
    NvU64 *pUsedHandleBitmask)
{
    return GetHandle(pParams->handlePool,
                     ARRAY_LEN(pParams->handlePool),
                     pUsedHandleBitmask);
}

static NvU32 GetDeviceHandle(
    const NvPushAllocDeviceParams *pParams,
    NvU64 *pUsedHandleBitmask)
{
    return GetHandle(pParams->handlePool,
                     ARRAY_LEN(pParams->handlePool),
                     pUsedHandleBitmask);
}

static void FreeSemaSurface(NvPushChannelPtr p)
{
    NvPushDevicePtr pDevice = p->pDevice;
    NvU32 *handle = p->progressSemaphore.handle;
    void **ptr = p->progressSemaphore.ptr;
    NvU32 status;
    int sd;

    if (p->progressSemaphore.gpuVA) {
        for (sd = pDevice->numSubDevices - 1; sd >= 0; sd--) {
            const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
            status = nvPushImportRmApiUnmapMemoryDma(
                pDevice,
                pDevice->subDevice[sd].handle,
                pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
                handle[sd],
                0,
                p->progressSemaphore.gpuVA);
            if (status != NVOS_STATUS_SUCCESS) {
                nvAssert(!"Failed to unmap progressSemaphore");
            }
        }
        p->progressSemaphore.gpuVA = 0;
    }

    for (sd = pDevice->numSubDevices - 1; sd >= 0; sd--) {
        const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
        if (!handle[sd]) {
            continue;
        }
        status = nvPushImportRmApiFree(
            pDevice,
            pDevice->subDevice[deviceIndex].deviceHandle,
            handle[sd]);
        if (status != NVOS_STATUS_SUCCESS) {
            nvAssert(!"Failed to free progressSemaphore");
        }
        handle[sd] = 0;

        /* Freeing this memory automatically unmaps it. */
        ptr[sd] = NULL;
    }
}

static NvBool AllocSemaSurface(
    NvPushChannelPtr p,
    const NvPushAllocChannelParams *pParams,
    NvBool coherent,
    NvU64 *pUsedHandleBitmask)
{
    NvPushDevicePtr pDevice = p->pDevice;
    NvU32 *handle = p->progressSemaphore.handle;
    void **ptr = p->progressSemaphore.ptr;
    NvU32 status;
    const NvU64 size = 4096;
    unsigned int sd;

    /* 1. Allocate sysmem surface(s) to back the semaphore, get CPU mapping */
    for (sd = 0; sd < pDevice->numSubDevices; sd++) {
        const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
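        /* RM memory sizes are passed as an inclusive limit, i.e. size - 1. */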
        NvU64 limit = size - 1;
        const NvU32 flags = DRF_DEF(OS02, _FLAGS, _PHYSICALITY, _NONCONTIGUOUS) |
            (coherent ? DRF_DEF(OS02, _FLAGS, _COHERENCY, _CACHED) :
                        DRF_DEF(OS02, _FLAGS, _COHERENCY, _UNCACHED));

        handle[sd] = GetChannelHandle(pParams, pUsedHandleBitmask);

        status = nvPushImportRmApiAllocMemory64(pDevice,
                                                pDevice->subDevice[deviceIndex].deviceHandle,
                                                handle[sd],
                                                NV01_MEMORY_SYSTEM,
                                                flags,
                                                &ptr[sd],
                                                &limit);

        if (status != NVOS_STATUS_SUCCESS) {
            handle[sd] = 0;
            nvAssert(!"Failed to allocate FIFO semaphore surface");
            goto fail;
        }
    }

    /* 2. Map the surface(s) into the GPU(s) */
    for (sd = 0; sd < pDevice->numSubDevices; sd++) {
        NvU32 flags = DRF_DEF(OS46, _FLAGS, _ACCESS, _READ_WRITE) |
            DRF_DEF(OS46, _FLAGS, _PAGE_SIZE, _4KB) |
            (coherent ? DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE) :
                        DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _DISABLE));
        const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);

        /*
         * Note that this mapping is somewhat special because we use a
         * different surface for each subdevice, but want to map at the same
         * virtual address on all subdevices.
         */
        if (sd == 0) {
            /*
             * Create a new virtual mapping.
             *
             * The MapMemoryDma call will assign to
             * 'p->progressSemaphore.gpuVA'.
             *
             * In !clientSli, this creates a broadcast mapping that we override
             * with the _DMA_UNICAST_REUSE_ALLOC flag below.
             * In clientSli, each mapping is already unicast.
             *
             * In both cases, the DMA_OFFSET_FIXED flag ensures the VA matches
             * between all subdevices.
             */
            p->progressSemaphore.gpuVA = 0;
            flags = FLD_SET_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _FALSE, flags);
        } else {
            /*
             * The MapMemoryDma call will read from
             * 'p->progressSemaphore.gpuVA'.
             */
            nvAssert(p->progressSemaphore.gpuVA != 0);
            if (!pDevice->clientSli) {
                flags = FLD_SET_DRF(OS46, _FLAGS, _DMA_UNICAST_REUSE_ALLOC, _TRUE, flags);
            }
            flags = FLD_SET_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE, flags);
        }

        status = nvPushImportRmApiMapMemoryDma(pDevice,
                                               pDevice->subDevice[sd].handle,
                                               pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
                                               handle[sd],
                                               0,
                                               size,
                                               flags,
                                               &p->progressSemaphore.gpuVA);
        if (status != NVOS_STATUS_SUCCESS) {
            nvAssert(!"Failed to map FIFO semaphore surface");
            goto fail;
        }
    }

    return TRUE;
fail:
    FreeSemaSurface(p);
    return FALSE;
}

/*
 * The size of the "progress tracker" portion of the pushbuffer.
 *
 * We use one set of progress tracker methods for every two GPFIFO entries (one
 * GPFIFO entry is for the main pushbuffer, the other is for the progress
 * tracker methods).
 */
static inline NvU32 ProgressTrackerBufferSize(NvPushChannelPtr buffer)
{
    return __nvPushProgressTrackerEntrySize(buffer->pDevice) *
           (buffer->numGpFifoEntries / 2);
}
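
/*
 * For example (illustrative numbers only): with numGpFifoEntries == 1024,
 * space is reserved for 512 progress tracker entries of
 * __nvPushProgressTrackerEntrySize() bytes each.
 */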

/*
 * The size of the pushbuffer allocation, including all segments and GPFIFO
 * entries.
 */
static inline NvU32 CalculateGPBufferSize(NvPushChannelPtr buffer)
{
    return __nvPushProgressTrackerOffset(buffer) +
           ProgressTrackerBufferSize(buffer);
}
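
/*
 * The resulting allocation layout, as set up in nvPushAllocChannel() below
 * (segment offsets come from __nvPushGpFifoOffset() and
 * __nvPushProgressTrackerOffset()):
 *
 *   +---------------------+ 0
 *   | main pushbuffer     |
 *   +---------------------+ __nvPushGpFifoOffset()
 *   | GPFIFO entries      |
 *   +---------------------+ __nvPushProgressTrackerOffset()
 *   | progress tracker    |
 *   +---------------------+ CalculateGPBufferSize()
 */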

/*!
 * Set up an NvPushChannelSegmentRec's initial state based on the provided
 * data.
 *
 * \param segment    Pointer to segment structure to initialize
 * \param ptr        CPU mapping of the base of the segment.
 * \param gpuOffset  GPU mapping of the base of the segment.
 * \param size       Size of the segment, in bytes.
 */
static void InitDmaSegment(NvPushChannelSegmentPtr segment,
                           void *ptr,
                           NvU64 gpuOffset,
                           NvU32 size)
{
    segment->base = (NvPushChannelUnion *)ptr;
    segment->buffer = (NvPushChannelUnion *)ptr;
    segment->sizeInBytes = size;
    segment->freeDwords = size >> 2;
    segment->gpuMapOffset = gpuOffset;
    segment->putOffset = 0;
}

/*!
 * Set up the work submit token.  RM will write this into the "error context
 * DMA" at the offset we request.
 */
static NvBool RequestChidToken(NvPushChannelPtr p)
{
    NvPushDevicePtr pDevice = p->pDevice;
    int deviceIndex;

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {

        NVC36F_CTRL_GPFIFO_SET_WORK_SUBMIT_TOKEN_NOTIF_INDEX_PARAMS notifParams = { 0 };
        NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS tokenParams = { 0 };
        NvU32 status;

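        /*
         * Use one notification slot per device, placed immediately after the
         * standard NV_CHANNELGPFIFO_NOTIFICATION_TYPE entries.
         */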
        notifParams.index = NV_CHANNELGPFIFO_NOTIFICATION_TYPE__SIZE_1 + deviceIndex;

        status = nvPushImportRmApiControl(pDevice,
                                          p->channelHandle[deviceIndex],
                                          NVC36F_CTRL_CMD_GPFIFO_SET_WORK_SUBMIT_TOKEN_NOTIF_INDEX,
                                          &notifParams,
                                          sizeof(notifParams));
        if (status != NVOS_STATUS_SUCCESS) {
            return FALSE;
        }

        /*
         * Request the channel's "work submit token".  The token itself isn't
         * used for anything here, but RM requires this control call to be
         * made after the channel has been allocated.
         */
        status = nvPushImportRmApiControl(pDevice,
                                          p->channelHandle[deviceIndex],
                                          NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN,
                                          &tokenParams,
                                          sizeof(tokenParams));
        if (status != NVOS_STATUS_SUCCESS) {
            return FALSE;
        }
    }
    return TRUE;
}

static NvBool BindAndScheduleChannel(NvPushDevicePtr pDevice,
                                     NvU32 channelHandle,
                                     NvU32 engineType)
{
    NVA06F_CTRL_BIND_PARAMS bindParams = { 0 };
    NVA06F_CTRL_GPFIFO_SCHEDULE_PARAMS scheduleParams = { 0 };
    NvU32 ret; /* holds an NVOS status, not a boolean */

    bindParams.engineType = engineType;
    ret = nvPushImportRmApiControl(pDevice,
                                   channelHandle,
                                   NVA06F_CTRL_CMD_BIND,
                                   &bindParams,
                                   sizeof(bindParams));

    if (ret != NVOS_STATUS_SUCCESS) {
        nvPushImportLogError(pDevice, "Failed to bind the channel");
        return FALSE;
    }

    scheduleParams.bEnable = NV_TRUE;
    ret = nvPushImportRmApiControl(pDevice,
                                   channelHandle,
                                   NVA06F_CTRL_CMD_GPFIFO_SCHEDULE,
                                   &scheduleParams,
                                   sizeof(scheduleParams));

    if (ret != NVOS_STATUS_SUCCESS) {
        nvPushImportLogError(pDevice,
                             "Failed to schedule the channel");
        return FALSE;
    }

    return TRUE;
}

static NvBool AllocChannelObject(
    NvPushChannelPtr buffer,
    const NvPushAllocChannelParams *pParams,
    NvU64 *pUsedHandleBitmask,
    NvU64 gpuAddress)
{
    NvPushDevicePtr pDevice = buffer->pDevice;
    NV_CHANNEL_ALLOC_PARAMS params = { 0 };
    unsigned int sd;
    NvU32 userdMapHandle[NV_MAX_SUBDEVICES];
    NvU32 ret;
    const NvU64 gpFifoOffset = gpuAddress + __nvPushGpFifoOffset(buffer);
    int deviceIndex;

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {
        buffer->channelHandle[deviceIndex] = GetChannelHandle(pParams, pUsedHandleBitmask);
        nvAssert(buffer->notifiers.errorCtxDma != 0);

        /* Open the DMA channel by allocating the CHANNEL_GPFIFO object */
        params.hObjectError = buffer->notifiers.errorCtxDma;
        if (pDevice->subDevice[deviceIndex].gpuVASpaceObject != 0) {
            params.hVASpace = pDevice->subDevice[deviceIndex].gpuVASpaceObject;
        } else {
            params.hObjectBuffer = pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma;
        }
        // Offset is relative to the ctx dma
        params.gpFifoOffset = gpFifoOffset;

        if (pDevice->hal.caps.allocateDoubleSizeGpFifo) {
            // On Tegra, we have to allocate twice the GPFIFO size.  This is
            // because the kernel will add its own entries (max 2) for the
            // kickoff for the pre-sync and post-sync fences.  This means the
            // max kickoff size is not actually buffer->numGpFifoEntries - 1,
            // it's most likely buffer->numGpFifoEntries - 3.
            //
            // TODO: Tell the users the actual max kickoff size to avoid this
            // WAR.  NvRmTegraChannelGetMaxKickoffGpfifoCount() retrieves this
            // piece of info on Tegra.  Bug 2404063.
            params.gpFifoEntries = buffer->numGpFifoEntries * 2;
        } else {
            params.gpFifoEntries = buffer->numGpFifoEntries;
        }

        params.flags = 0;
        if (pParams->secureChannel) {
            params.flags |= DRF_DEF(OS04, _FLAGS, _CC_SECURE, _TRUE);
        }
        if (pParams->difrPrefetch) {
            params.flags |= DRF_DEF(OS04,
                                    _FLAGS,
                                    _SET_EVICT_LAST_CE_PREFETCH_CHANNEL,
                                    _TRUE);
        }

        if (pDevice->hal.caps.clientAllocatesUserD) {
            if (pDevice->clientSli) {
                params.hUserdMemory[0] = buffer->userD[deviceIndex].hMemory;
                params.userdOffset[0] = 0;
            } else {
                for (sd = 0; sd < pDevice->numSubDevices; sd++) {
                    params.hUserdMemory[sd] = buffer->userD[0].hMemory;
                    params.userdOffset[sd] = 0;
                }
            }
            userdMapHandle[deviceIndex] = buffer->userD[deviceIndex].hMemory;
        } else {
            userdMapHandle[deviceIndex] = buffer->channelHandle[deviceIndex];
        }
        params.engineType = pParams->engineType;
        if (pDevice->clientSli) {
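            /*
             * In client-side SLI each channel targets a single subdevice,
             * selected by a one-hot subDeviceId mask.
             */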
            params.subDeviceId = (1 << deviceIndex);
        }

        if ((ret = nvPushImportRmApiAlloc(pDevice,
                                          pDevice->subDevice[deviceIndex].deviceHandle,
                                          buffer->channelHandle[deviceIndex],
                                          pDevice->gpfifoClass,
                                          &params)) != NVOS_STATUS_SUCCESS)
        {
            nvPushImportLogError(pDevice,
                                 "Push buffer object allocation failed: 0x%x (%s)",
                                 ret, nvstatusToString(ret));
            buffer->channelHandle[deviceIndex] = 0;
            return FALSE;
        }

        if (!BindAndScheduleChannel(pDevice,
                                    buffer->channelHandle[deviceIndex],
                                    pParams->engineType)) {
            return FALSE;
        }
    }

    for (sd = 0; sd < pDevice->numSubDevices; sd++) {
        void *pUserD;

        deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);

        // Map the DMA controls for each subdevice.
        ret = nvPushImportRmApiMapMemory(pDevice,
                                         pDevice->subDevice[sd].handle,
                                         userdMapHandle[deviceIndex],
                                         0,
                                         pDevice->userDSize,
                                         &pUserD,
                                         0);
        if (ret != NVOS_STATUS_SUCCESS) {
            nvPushImportLogError(pDevice,
                                 "Push buffer mapping failed: 0x%x (%s)",
                                 ret, nvstatusToString(ret));
            return FALSE;
        }

        buffer->control[sd] = pUserD;
    }

    return TRUE;
}

/*
 * It might be nice to suballocate these rather
 * than create a separate RM allocation for each channel.
 */
static NvBool nvDmaAllocUserD(
    NvPushChannelPtr p,
    const NvPushAllocChannelParams *pParams,
    NvU64 *pUsedHandleBitmask)
{
    NvPushDevicePtr pDevice = p->pDevice;
    int deviceIndex;

    if (!pDevice->hal.caps.clientAllocatesUserD) {
        return TRUE;
    }

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {
        NV_MEMORY_ALLOCATION_PARAMS memAllocParams = { 0 };
        const NvU32 attr =
            DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM) |
            DRF_DEF(OS32, _ATTR, _PAGE_SIZE, _4KB) |
            DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
        const NvU32 flags =
            NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE |
            NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
        NvU32 ret;

        NvU32 hMemory = GetChannelHandle(pParams, pUsedHandleBitmask);

        memAllocParams.owner = pDevice->clientHandle;
        memAllocParams.type = NVOS32_TYPE_DMA;
        memAllocParams.size = pDevice->userDSize;
        memAllocParams.attr = attr;
        memAllocParams.flags = flags;
        memAllocParams.alignment = pDevice->userDSize;

        ret = nvPushImportRmApiAlloc(pDevice,
                                     pDevice->subDevice[deviceIndex].deviceHandle,
                                     hMemory,
                                     NV01_MEMORY_LOCAL_USER,
                                     &memAllocParams);
        if (ret != NV_OK) {
            return FALSE;
        }

        p->userD[deviceIndex].hMemory = hMemory;
    }

    return TRUE;
}

int nvPushGetSupportedClassIndex(
    NvPushDevicePtr pDevice,
    const void *pClassTable,
    size_t classTableStride,
    size_t classTableLength)
{
    unsigned int i, j;

    for (i = 0; i < classTableLength; i++) {

        const NvU8 *bytes = (const NvU8 *)pClassTable;
        const size_t byteOffset = i * classTableStride;
        const NvPushSupportedClass *pClass =
            (const NvPushSupportedClass *)(bytes + byteOffset);

        if (nvPushIsAModel(pDevice)) {
            if (pDevice->amodelConfig == pClass->amodelConfig) {
                return i;
            }
            continue;
        }

        for (j = 0; j < pDevice->numClasses; j++) {
            if (pClass->classNumber == pDevice->supportedClasses[j]) {
                return i;
            }
        }
    }
    return -1;
}

static NvBool GetChannelClassAndUserDSize(
    NvPushDevicePtr pDevice,
    const NvPushAllocDeviceParams *pParams)
{
    const struct {
        NvPushSupportedClass base;
        size_t gpFifoSize;
    } gpFifoDmaClasses[] = {
        {
            { HOPPER_CHANNEL_GPFIFO_A,
              NV_AMODEL_HOPPER },
            sizeof(HopperAControlGPFifo)
        },
        {
            { AMPERE_CHANNEL_GPFIFO_A,
              NV_AMODEL_ADA },
            sizeof(AmpereAControlGPFifo)
        },
        {
            { AMPERE_CHANNEL_GPFIFO_A,
              NV_AMODEL_AMPERE },
            sizeof(AmpereAControlGPFifo)
        },
        {
            { TURING_CHANNEL_GPFIFO_A,
              NV_AMODEL_TURING },
            sizeof(TuringAControlGPFifo)
        },
        {
            { VOLTA_CHANNEL_GPFIFO_A,
              NV_AMODEL_VOLTA },
            sizeof(VoltaAControlGPFifo)
        },
        {
            { PASCAL_CHANNEL_GPFIFO_A,
              NV_AMODEL_PASCAL },
            sizeof(PascalAControlGPFifo)
        },
        {
            { MAXWELL_CHANNEL_GPFIFO_A,
              NV_AMODEL_MAXWELL },
            sizeof(MaxwellAControlGPFifo)
        },
        {
            { KEPLER_CHANNEL_GPFIFO_C,
              NV_AMODEL_KEPLER_SM35 },
            sizeof(KeplerCControlGPFifo)
        },
        {
            { KEPLER_CHANNEL_GPFIFO_B,
              NV_AMODEL_KEPLER },
            sizeof(KeplerBControlGPFifo)
        },
    };

    int i;

    i = nvPushGetSupportedClassIndex(pDevice, gpFifoDmaClasses,
                                     sizeof(gpFifoDmaClasses[0]),
                                     ARRAY_LEN(gpFifoDmaClasses));
    if (i == -1) {
        return FALSE;
    }

    pDevice->gpfifoClass = gpFifoDmaClasses[i].base.classNumber;
    pDevice->userDSize = gpFifoDmaClasses[i].gpFifoSize;
    return TRUE;
}

/*
 * Query GPU<->CPU coherency.  In particular, *pCoherent is set to TRUE when
 * the GPU is capable of accessing CPU-cached system memory coherently with
 * respect to CPU accesses.
 *
 * For surfaces with CPU read/write or CPU read-mostly such as notifiers:
 * If *pCoherent is TRUE:
 * - create CPU mappings with COHERENCY_WRITE_BACK
 * - create GPU mappings with CACHE_SNOOP_ENABLE
 * If *pCoherent is FALSE:
 * - create CPU mappings with COHERENCY_UNCACHED
 * - create GPU mappings with CACHE_SNOOP_DISABLE
 *
 * (CPU write-mostly surfaces such as the pushbuffer always use WRITE_COMBINED
 * memory.)
 *
 * Note we only query on the first subdevice and assume the other subdevices
 * are the same.
 */
static NvBool GetCoherenceFlags(
    NvPushChannelPtr pChannel,
    NvBool *pCoherent)
{
    NvPushDevicePtr pDevice = pChannel->pDevice;
    NV2080_CTRL_BUS_GET_INFO_PARAMS busInfo = { 0 };
    struct {
        NV2080_CTRL_BUS_INFO coherentFlags;
    } busInfoList;

    NvU32 ret;

    NVMISC_MEMSET(&busInfoList, 0, sizeof(busInfoList));
    busInfoList.coherentFlags.index =
        NV2080_CTRL_BUS_INFO_INDEX_COHERENT_DMA_FLAGS;

    busInfo.busInfoListSize = sizeof(busInfoList) /
                              sizeof(NV2080_CTRL_BUS_INFO);
    busInfo.busInfoList = NV_PTR_TO_NvP64(&busInfoList);

    ret = nvPushImportRmApiControl(pDevice,
                                   pDevice->subDevice[0].handle,
                                   NV2080_CTRL_CMD_BUS_GET_INFO,
                                   &busInfo, sizeof(busInfo));

    if (ret != NVOS_STATUS_SUCCESS) {
        return FALSE;
    }

    *pCoherent =
        FLD_TEST_DRF(2080_CTRL_BUS_INFO, _COHERENT_DMA_FLAGS, _GPUGART, _TRUE,
                     busInfoList.coherentFlags.data);
    return TRUE;
}

static NvBool TryAllocAndMapPushbuffer(
    NvPushChannelPtr pChannel,
    const NvU32 allocFlags,
    const NvU32 mapFlags,
    const NvU32 limit,
    void **pCpuAddress,
    NvU64 *pGpuAddress)
{
    NvU32 ret;
    NvU64 localLimit;
    NvU64 size = limit + 1;
    void *cpuAddress = NULL;
    NvU64 gpuAddress = 0;
    NvPushDevicePtr pDevice = pChannel->pDevice;
    int deviceIndex;
    NvBool vaAlloc[NV_MAX_SUBDEVICES] = { 0 };
    NvBool vaMap[NV_MAX_SUBDEVICES] = { 0 };
    NvBool surfaceAlloc = FALSE;

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {
        NV_MEMORY_ALLOCATION_PARAMS vaParams = { 0 };

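        /* Tag the VA allocation with an owner of "push" in ASCII. */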
        vaParams.owner = 0x70757368;
        vaParams.type = NVOS32_TYPE_DMA;
        vaParams.flags =
            NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
            NVOS32_ALLOC_FLAGS_VIRTUAL;
        vaParams.size = size;
        vaParams.hVASpace = pDevice->subDevice[deviceIndex].gpuVASpaceObject;

        if (deviceIndex == 0) {
            /* For the first device, RM assigns a virtual address. */
            if (pChannel->pDevice->hal.caps.extendedBase) {
                /*
                 * Force the virtual mapping to be naturally aligned.
                 * This ensures that the allocation cannot cross a 40-bit
                 * boundary, so we can initialize the higher bits of the VA
                 * with the PB_EXTENDED_BASE_OPERAND GPFIFO command once at
                 * init time and not worry about it being able to change
                 * between any two GPFIFO entries.
                 */
                vaParams.flags |= NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE;
                vaParams.alignment = size;
                ROUNDUP_POW2_U64(vaParams.alignment);
            }
        } else {
            /* For subsequent devices, use the same virtual address. */
            vaParams.flags |= NVOS32_ALLOC_FLAGS_FIXED_ADDRESS_ALLOCATE;
            nvAssert(gpuAddress != 0);
            vaParams.offset = gpuAddress;
        }

        ret = nvPushImportRmApiAlloc(
            pDevice,
            pDevice->subDevice[deviceIndex].deviceHandle,
            pChannel->pushbufferVAHandle[deviceIndex],
            NV50_MEMORY_VIRTUAL,
            &vaParams);

        if (ret != NVOS_STATUS_SUCCESS) {
            goto fail;
        }
        vaAlloc[deviceIndex] = TRUE;

        if (deviceIndex == 0) {
            gpuAddress = vaParams.offset;
            nvAssert(vaParams.size >= size);
            /* The VA allocation may have been bloated to a larger size, to
             * align with the page size.  Adjust to ensure that we allocate a
             * surface of at least that size, or else attempts to map it will
             * fail. */
            size = vaParams.size;
        } else {
            nvAssert(gpuAddress == vaParams.offset);
            nvAssert(vaParams.size == size);
        }
    }

    /* Allocate a single surface in system memory for the pushbuffer. */
    localLimit = size - 1;
    ret = nvPushImportRmApiAllocMemory64(
        pDevice,
        pDevice->subDevice[0].deviceHandle,
        pChannel->pushbufferHandle,
        NV01_MEMORY_SYSTEM,
        allocFlags,
        &cpuAddress,
        &localLimit);

    if (ret != NVOS_STATUS_SUCCESS) {
        goto fail;
    }
    nvAssert(localLimit + 1 >= size);
    surfaceAlloc = TRUE;

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {
        NvU64 mapOffset = 0;

        ret = nvPushImportRmApiMapMemoryDma(
            pDevice,
            pDevice->subDevice[deviceIndex].deviceHandle,
            pChannel->pushbufferVAHandle[deviceIndex],
            pChannel->pushbufferHandle,
            0,
            size,
            mapFlags,
            &mapOffset);

        if (ret != NVOS_STATUS_SUCCESS) {
            goto fail;
        }
        vaMap[deviceIndex] = TRUE;
        /* mapMemoryDma takes in a relative offset but assigns an absolute VA */
        nvAssert(mapOffset == gpuAddress);
    }

    /* success */
    *pCpuAddress = cpuAddress;
    *pGpuAddress = gpuAddress;
    return TRUE;

fail:
    for (deviceIndex = __nvPushGetNumDevices(pDevice) - 1;
         deviceIndex >= 0;
         deviceIndex--) {
        if (vaMap[deviceIndex]) {
            ret = nvPushImportRmApiUnmapMemoryDma(pDevice,
                                                  pDevice->subDevice[deviceIndex].deviceHandle,
                                                  pChannel->pushbufferVAHandle[deviceIndex],
                                                  pChannel->pushbufferHandle,
                                                  0,
                                                  gpuAddress);
            nvAssert(ret == NVOS_STATUS_SUCCESS);
            vaMap[deviceIndex] = FALSE;
        }
        if (vaAlloc[deviceIndex]) {
            ret = nvPushImportRmApiFree(pDevice,
                                        pDevice->subDevice[deviceIndex].deviceHandle,
                                        pChannel->pushbufferVAHandle[deviceIndex]);
            nvAssert(ret == NVOS_STATUS_SUCCESS);
            vaAlloc[deviceIndex] = FALSE;
        }
    }

    if (surfaceAlloc) {
        ret = nvPushImportRmApiFree(pDevice,
                                    pDevice->subDevice[0].deviceHandle,
                                    pChannel->pushbufferHandle);
        nvAssert(ret == NVOS_STATUS_SUCCESS);
    }

    return FALSE;
}

static NvBool AllocPushbuffer(
    NvPushChannelPtr pChannel,
    const NvPushAllocChannelParams *pParams,
    NvU64 *pUsedHandleBitmask,
    void **pCpuAddress,
    NvU64 *pGpuAddress)
{
    const NvU32 size = CalculateGPBufferSize(pChannel);
    NvU32 limit = size - 1;
    int deviceIndex;

    pChannel->pushbufferHandle = GetChannelHandle(pParams, pUsedHandleBitmask);
    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pChannel->pDevice);
         deviceIndex++) {
        pChannel->pushbufferVAHandle[deviceIndex] =
            GetChannelHandle(pParams, pUsedHandleBitmask);
    }

    if (TryAllocAndMapPushbuffer(
            pChannel,
            DRF_DEF(OS02, _FLAGS, _PHYSICALITY, _NONCONTIGUOUS) |
            DRF_DEF(OS02, _FLAGS, _COHERENCY, _WRITE_COMBINE),
            DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _DISABLE),
            limit,
            pCpuAddress,
            pGpuAddress)) {
        return TRUE;
    }

    pChannel->pushbufferHandle = 0;
    NVMISC_MEMSET(pChannel->pushbufferVAHandle, 0, sizeof(pChannel->pushbufferVAHandle));
    return FALSE;
}

/*!
 * Free resources allocated in AllocUserMode().
 */
static void FreeUserMode(
    NvPushDevicePtr pDevice)
{
    NvU32 sd;

    for (sd = 0; sd < pDevice->numSubDevices; sd++) {

        if (pDevice->subDevice[sd].pUserMode != NULL) {
            nvPushImportRmApiUnmapMemory(
                pDevice,
                pDevice->subDevice[sd].handle,
                pDevice->subDevice[sd].hUserMode,
                pDevice->subDevice[sd].pUserMode,
                0 /* flags */);
            pDevice->subDevice[sd].pUserMode = NULL;
        }

        if (pDevice->subDevice[sd].hUserMode != 0) {
            nvPushImportRmApiFree(
                pDevice,
                pDevice->subDevice[sd].handle,
                pDevice->subDevice[sd].hUserMode);
            pDevice->subDevice[sd].hUserMode = 0;
        }
    }
}

/*!
 * Allocate and map the "usermode" object on each subdevice, supported on GV100
 * and up.  This mapping exposes registers considered safe for userspace to
 * access directly.  Most importantly, it contains the "doorbell" register
 * which we use to notify HOST that we've updated GP_PUT so that it will fetch
 * work for the channel.
 */
static NvBool AllocUserMode(
    NvPushDevicePtr pDevice,
    const NvPushAllocDeviceParams *pParams,
    NvU64 *pUsedHandleBitmask)
{
    unsigned int sd;

    static const NvPushSupportedClass userModeClasses[] = {
        { HOPPER_USERMODE_A,
          NV_AMODEL_HOPPER },
        { VOLTA_USERMODE_A,
          NV_AMODEL_VOLTA },
    };
    int i;

    if (!pDevice->hal.caps.clientAllocatesUserD) {
        return TRUE;
    }

    i = nvPushGetSupportedClassIndex(pDevice, userModeClasses,
                                     sizeof(userModeClasses[0]),
                                     ARRAY_LEN(userModeClasses));
    if (i == -1) {
        return FALSE;
    }

    for (sd = 0; sd < pDevice->numSubDevices; sd++) {
        NvU32 ret;
        void *allocParams = NULL;

        NV_HOPPER_USERMODE_A_PARAMS hopperParams = { 0 };
        if (userModeClasses[i].classNumber != VOLTA_USERMODE_A) {
            allocParams = &hopperParams;
            // The BAR1 mapping is used for (faster and more efficient) writes
            // to perform work submission, but can't be used for reads.
            // If we ever want to read from the USERMODE region (e.g., to read
            // PTIMER) then we need a second mapping.
            hopperParams.bBar1Mapping = NV_TRUE;
        }

        pDevice->subDevice[sd].hUserMode =
            GetDeviceHandle(pParams, pUsedHandleBitmask);

        ret = nvPushImportRmApiAlloc(
            pDevice,
            pDevice->subDevice[sd].handle,
            pDevice->subDevice[sd].hUserMode,
            userModeClasses[i].classNumber,
            allocParams);

        if (ret != NVOS_STATUS_SUCCESS) {
            pDevice->subDevice[sd].hUserMode = 0;
            goto fail;
        }

        ret = nvPushImportRmApiMapMemory(
            pDevice,
            pDevice->subDevice[sd].handle,
            pDevice->subDevice[sd].hUserMode,
            0, /* offset */
            NVC361_NV_USERMODE__SIZE,
            &pDevice->subDevice[sd].pUserMode,
            0 /* flags */);

        if (ret != NVOS_STATUS_SUCCESS) {
            goto fail;
        }
    }

    return TRUE;

fail:
    FreeUserMode(pDevice);
    return FALSE;
}

static void CheckCaps(NvPushDevicePtr pDevice)
{
    int deviceIndex;

    pDevice->hostLBoverflowBug1667921 = FALSE;

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {
        NV0080_CTRL_FIFO_GET_CAPS_V2_PARAMS fifoCapsParams = { 0 };
        NvU32 ret;

        ret = nvPushImportRmApiControl(pDevice,
                                       pDevice->subDevice[deviceIndex].deviceHandle,
                                       NV0080_CTRL_CMD_FIFO_GET_CAPS_V2,
                                       &fifoCapsParams,
                                       sizeof(fifoCapsParams));
        if (ret != NVOS_STATUS_SUCCESS) {
            nvAssert(!"Failed to determine chip fifo capabilities");
            return;
        }

        pDevice->hostLBoverflowBug1667921 |=
            !!NV0080_CTRL_FIFO_GET_CAP(fifoCapsParams.capsTbl,
                                       NV0080_CTRL_FIFO_CAPS_HAS_HOST_LB_OVERFLOW_BUG_1667921);
    }
}

static void FreeNotifiers(
    NvPushChannelPtr pChannel)
{
    NvPushDevicePtr pDevice = pChannel->pDevice;

    if (pChannel->notifiers.errorCtxDma != 0) {
        nvPushImportRmApiFree(pDevice,
                              pDevice->clientHandle,
                              pChannel->notifiers.errorCtxDma);
        pChannel->notifiers.errorCtxDma = 0;
    }

    if (pChannel->notifiers.gpuAddress != 0) {
        int deviceIndex;
        for (deviceIndex = __nvPushGetNumDevices(pDevice) - 1;
             deviceIndex >= 0;
             deviceIndex--) {
            nvPushImportRmApiUnmapMemoryDma(pDevice,
                                            pDevice->subDevice[deviceIndex].deviceHandle,
                                            pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
                                            pChannel->notifiers.memoryHandle,
                                            0,
                                            pChannel->notifiers.gpuAddress);
        }
        pChannel->notifiers.gpuAddress = 0;
    }

    if (pChannel->notifiers.memoryHandle != 0) {
        nvPushImportRmApiFree(pDevice,
                              pDevice->subDevice[0].deviceHandle,
                              pChannel->notifiers.memoryHandle);
        pChannel->notifiers.memoryHandle = 0;
    }
}

/*
 * Allocate enough notifier memory to store:
 * - numNotifiers host driver requested NvNotifications, per subDevice
 * - NV_PUSH_NUM_INTERNAL_NOTIFIERS NvNotifications, per channel
 */
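/*
 * Layout of the allocation, per the code below: the
 * NV_PUSH_NUM_INTERNAL_NOTIFIERS internal notifiers occupy the start of the
 * memory (the errorCtxDma created below covers exactly that prefix), and the
 * host-driver notifiers follow.
 */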
static NvBool AllocNotifiers(
    NvPushChannelPtr pChannel,
    const NvPushAllocChannelParams *pParams,
    NvBool coherent,
    NvU64 *pUsedHandleBitmask)
{
    NvPushDevicePtr pDevice = pChannel->pDevice;
    const NvU32 size =
        (((pParams->numNotifiers * pDevice->numSubDevices) +
          NV_PUSH_NUM_INTERNAL_NOTIFIERS) *
         sizeof(NvNotification));
    NV_CONTEXT_DMA_ALLOCATION_PARAMS ctxdmaParams = { 0 };

    NvU64 limit = size - 1;
    int deviceIndex;
    NvU32 ret;
    NvU32 allocFlags, gpuMapFlags;

    /*
     * The host-driver specified number of notifiers must not collide
     * with the reserved bit we use to indicate internal notifiers.
     */
    if (pParams->numNotifiers & NV_PUSH_NOTIFIER_INTERNAL_BIT) {
        return FALSE;
    }

    pChannel->notifiers.num = pParams->numNotifiers;
    pChannel->notifiers.memoryHandle =
        GetChannelHandle(pParams, pUsedHandleBitmask);

    allocFlags = DRF_DEF(OS02, _FLAGS, _PHYSICALITY, _NONCONTIGUOUS);
    gpuMapFlags = 0;
    if (coherent) {
        allocFlags = FLD_SET_DRF(OS02, _FLAGS, _COHERENCY, _WRITE_BACK, allocFlags);
        gpuMapFlags = FLD_SET_DRF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE, gpuMapFlags);
    } else {
        allocFlags = FLD_SET_DRF(OS02, _FLAGS, _COHERENCY, _UNCACHED, allocFlags);
        gpuMapFlags = FLD_SET_DRF(OS46, _FLAGS, _CACHE_SNOOP, _DISABLE, gpuMapFlags);
    }

    ret = nvPushImportRmApiAllocMemory64(
        pDevice,
        pDevice->subDevice[0].deviceHandle,
        pChannel->notifiers.memoryHandle,
        NV01_MEMORY_SYSTEM,
        allocFlags,
        (void **)&pChannel->notifiers.cpuAddress,
        &limit);

    if (ret != NVOS_STATUS_SUCCESS) {
        pChannel->notifiers.memoryHandle = 0;
        goto fail;
    }

    /* Map the memory into the GPU's VA space. */

    for (deviceIndex = 0;
         deviceIndex < __nvPushGetNumDevices(pDevice);
         deviceIndex++) {
        NvU32 mapFlags = gpuMapFlags;
        NvU64 gpuAddress;
        if (deviceIndex == 0) {
            /* For the first device, RM assigns a virtual address. */
            gpuAddress = 0;
        } else {
            /* For subsequent devices, use the same virtual address. */
            mapFlags = FLD_SET_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE,
                                   mapFlags);
            gpuAddress = pChannel->notifiers.gpuAddress;
            nvAssert(gpuAddress != 0);
        }
        ret = nvPushImportRmApiMapMemoryDma(
            pDevice,
            pDevice->subDevice[deviceIndex].deviceHandle,
            pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
            pChannel->notifiers.memoryHandle,
            0, /* offset */
            size,
            mapFlags,
            &gpuAddress);

        if (ret != NVOS_STATUS_SUCCESS) {
            goto fail;
        }

        if (deviceIndex == 0) {
            pChannel->notifiers.gpuAddress = gpuAddress;
        } else {
            nvAssert(pChannel->notifiers.gpuAddress == gpuAddress);
        }
    }

    /* Create the internal notifier ctxDma. */

    pChannel->notifiers.errorCtxDma =
        GetChannelHandle(pParams, pUsedHandleBitmask);

    ctxdmaParams.hMemory = pChannel->notifiers.memoryHandle;
    ctxdmaParams.flags = DRF_DEF(OS03, _FLAGS, _MAPPING, _KERNEL) |
                         DRF_DEF(OS03, _FLAGS, _HASH_TABLE, _DISABLE);
    /* the internal notifiers are at the start of the memory */
    ctxdmaParams.offset = 0;
    ctxdmaParams.limit = (NV_PUSH_NUM_INTERNAL_NOTIFIERS *
                          sizeof(NvNotification)) - 1;

    ret = nvPushImportRmApiAlloc(pDevice,
                                 pDevice->subDevice[0].deviceHandle,
                                 pChannel->notifiers.errorCtxDma,
                                 NV01_CONTEXT_DMA,
                                 &ctxdmaParams);

    if (ret != NVOS_STATUS_SUCCESS) {
        pChannel->notifiers.errorCtxDma = 0;
        goto fail;
    }

    /*
     * Initialize the error notifier; note that there is only one
     * error notifier shared by all subdevices, so we specify the primary
     * subdevice as the subDeviceMask.
     */
    nvPushInitWaitForNotifier(pChannel,
                              NV_PUSH_ERROR_NOTIFIER_INDEX,
                              NV_PUSH_SUBDEVICE_MASK_PRIMARY);

    return TRUE;

fail:
    FreeNotifiers(pChannel);
    return FALSE;
}

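/*
 * Return bits 63:40 of a GPU virtual address: NvU64_HI32() yields bits 63:32,
 * and the additional >> 8 leaves the "extended base" consumed by the
 * SET_PB_SEGMENT_EXTENDED_BASE GPFIFO opcode below.
 */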
static NvU32 GetExtendedBase(NvU64 offset)
{
    return NvU64_HI32(offset) >> 8;
}

static void InitGpFifoExtendedBase(
    NvPushChannelPtr pChannel)
{
    const NvU64 pbBase = pChannel->main.gpuMapOffset;
    const NvU32 extendedBase = GetExtendedBase(pbBase);
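    /* Each GPFIFO entry is two dwords (GP_ENTRY0, GP_ENTRY1), hence '* 2'. */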
    NvU32 *gpPointer = &(pChannel->gpfifo[pChannel->gpPutOffset * 2]);
    NvU32 i;

    if (!pChannel->pDevice->hal.caps.extendedBase) {
        nvAssert(extendedBase == 0);
        return;
    }

    /*
     * Because of the natural VA alignment specified when allocating the
     * pushbuffer, all parts of the pushbuffer surface should be in the same
     * 40-bit region.
     */
    nvAssert(GetExtendedBase(pChannel->main.gpuMapOffset) ==
             GetExtendedBase(pChannel->progressTracker.gpuMapOffset));
    nvAssert(GetExtendedBase(pChannel->main.gpuMapOffset +
                             pChannel->main.sizeInBytes - 1) ==
             GetExtendedBase(pChannel->main.gpuMapOffset));
    nvAssert(GetExtendedBase(pChannel->progressTracker.gpuMapOffset +
                             pChannel->progressTracker.sizeInBytes - 1) ==
             GetExtendedBase(pChannel->progressTracker.gpuMapOffset));

    /* Set the "extended base" for all subsequent methods */
    gpPointer[0] = DRF_NUM(C86F, _GP_ENTRY0, _PB_EXTENDED_BASE_OPERAND, extendedBase);
    gpPointer[1] = DRF_DEF(C86F, _GP_ENTRY1, _OPCODE, _SET_PB_SEGMENT_EXTENDED_BASE);
    gpPointer += 2;

    /* Pad out with NOP GPFIFO methods so everything remains aligned. */
    for (i = 1; i < NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF; i++) {
        gpPointer[0] = 0;
        gpPointer[1] = DRF_DEF(C86F, _GP_ENTRY1, _OPCODE, _NOP);
        gpPointer += 2;
    }

    pChannel->gpPutOffset += NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF;
}

NvBool nvPushAllocChannel(
    const NvPushAllocChannelParams *pParams,
    NvPushChannelPtr buffer)
{
    NvPushDevicePtr pDevice;
    void *cpuAddress = NULL;
    NvU64 gpuAddress = 0;
    NvU64 usedHandleBitmask = 0;
    NvBool coherent = FALSE;

    NVMISC_MEMSET(buffer, 0, sizeof(*buffer));

    pDevice = pParams->pDevice;

    buffer->pDevice = pDevice;
    buffer->logNvDiss = pParams->logNvDiss;
    buffer->noTimeout = pParams->noTimeout;
    buffer->ignoreChannelErrors = pParams->ignoreChannelErrors;

    buffer->currentSubDevMask = NV_PUSH_SUBDEVICE_MASK_ALL;

    /*
     * Assign main.sizeInBytes early, because the rest of
     * initialization relies on knowing the main pushbuffer size.
     * Note this must fit in NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GET,
     * which stores dwords.
     */
    nvAssert((DRF_MASK(NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GET) * 4) >
             pParams->pushBufferSizeInBytes);
    buffer->main.sizeInBytes = pParams->pushBufferSizeInBytes;

    /*
     * Compute numGpFifoEntries.  There are several constraints:
     *
     * - We make numGpFifoEntries 1/64th the size of the main
     *   pushbuffer.  The maximum pushbuffer size is 1048572, and we
     *   consume 2 gpFifo entries per kickoff.  This works out to be
     *   128 bytes of pushbuffer (32 dwords) per kickoff, before we
     *   are gpFifo-limited.
     *
     * - Per dev_pbdma.ref, "The number of GP entries in the circular
     *   buffer is always a power of 2."  So, round up to the next
     *   power of two.
     *
     * - Because we consume 2 gpFifo entries per kickoff
     *   (NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF), we also align to a
     *   multiple of 2.  This should be guaranteed by the power of 2
     *   check.
     *
     * - numGpFifoEntries must fit in
     *   NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GP_GET so that the
     *   progress tracker semaphore releases can report the consumed
     *   gpFifo entry.  The distribution of bits in
     *   NV_PUSH_PROGRESS_TRACKER_SEMAPHORE should ensure this is
     *   satisfied.
     */

    buffer->numGpFifoEntries = pParams->pushBufferSizeInBytes / 64;

    buffer->numGpFifoEntries = nvNextPow2_U32(buffer->numGpFifoEntries);

    nvAssert((buffer->numGpFifoEntries %
              NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF) == 0);

    nvAssert((DRF_MASK(NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GP_GET) * 2) >
             buffer->numGpFifoEntries);
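
    /*
     * For example (illustrative numbers only): a pushBufferSizeInBytes of
     * 1048572 gives 1048572 / 64 = 16383, which nvNextPow2_U32() rounds up
     * to numGpFifoEntries = 16384.
     */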

    if (!GetCoherenceFlags(buffer, &coherent)) {
        goto failed;
    }

    if (!AllocNotifiers(buffer, pParams, coherent, &usedHandleBitmask)) {
        nvPushImportLogError(pDevice,
                             "Failed to allocate notification memory.");
        goto failed;
    }

    /* Only allocate memory for one pushbuffer.  All subdevices will share */
    if (!AllocPushbuffer(buffer,
                         pParams,
                         &usedHandleBitmask,
                         &cpuAddress,
                         &gpuAddress)) {
        nvPushImportLogError(pDevice,
                             "Push buffer DMA allocation failed");
        goto failed;
    }

    /* First the "main" pushbuffer */
    InitDmaSegment(&buffer->main,
                   cpuAddress,
                   gpuAddress,
                   pParams->pushBufferSizeInBytes);
    /* Next the GPFIFO */
    buffer->gpfifo =
        (NvU32 *)((char *)cpuAddress + __nvPushGpFifoOffset(buffer));
    buffer->gpPutOffset = 0;
    /* Next the "progressTracker" */
    InitDmaSegment(&buffer->progressTracker,
                   (char *)cpuAddress + __nvPushProgressTrackerOffset(buffer),
                   gpuAddress + __nvPushProgressTrackerOffset(buffer),
                   ProgressTrackerBufferSize(buffer));

    if (!nvDmaAllocUserD(buffer, pParams, &usedHandleBitmask)) {
        goto failed;
    }

    if (!AllocChannelObject(buffer, pParams,
                            &usedHandleBitmask, gpuAddress)) {
        goto failed;
    }

    if (pDevice->hal.caps.clientAllocatesUserD &&
        !RequestChidToken(buffer)) {
        goto failed;
    }

    if (!AllocSemaSurface(buffer, pParams, coherent, &usedHandleBitmask)) {
        goto failed;
    }

#if defined(DEBUG)
    if (buffer->logNvDiss) {
        nvPushImportLogNvDiss(buffer, "nvdiss: encoding 2\n");
    }
#endif /* DEBUG */

    InitGpFifoExtendedBase(buffer);

    if (!__nvPushTestPushBuffer(buffer)) {
        goto failed;
    }

    buffer->initialized = TRUE;

    return TRUE;

failed:
    nvPushFreeChannel(buffer);
    return FALSE;
}

/*!
 * Free resources allocated by nvPushAllocChannel().
 */
void nvPushFreeChannel(NvPushChannelPtr buffer)
{
    NvPushDevicePtr pDevice = buffer->pDevice;
    unsigned int sd;
    int deviceIndex;

    if (pDevice == NULL) {
        goto done;
    }

    /* Unmap pushbuffer DMA controls */
    for (sd = 0; sd < pDevice->numSubDevices; sd++) {
        NvU32 userdMapHandle;

        deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
        if (pDevice->hal.caps.clientAllocatesUserD) {
            userdMapHandle = buffer->userD[deviceIndex].hMemory;
        } else {
            userdMapHandle = buffer->channelHandle[deviceIndex];
        }

        if (buffer->control[sd]) {
            nvPushImportRmApiUnmapMemory(pDevice,
                                         pDevice->subDevice[sd].handle,
                                         userdMapHandle,
                                         buffer->control[sd],
                                         0);
            buffer->control[sd] = NULL;
        }
    }

    for (deviceIndex = __nvPushGetNumDevices(pDevice) - 1;
         deviceIndex >= 0;
         deviceIndex--) {
        if (buffer->channelHandle[deviceIndex] != 0) {
            nvPushImportRmApiFree(pDevice,
                                  pDevice->subDevice[deviceIndex].deviceHandle,
                                  buffer->channelHandle[deviceIndex]);
            buffer->channelHandle[deviceIndex] = 0;
        }

        if (buffer->userD[deviceIndex].hMemory != 0) {
            nvPushImportRmApiFree(pDevice,
                                  pDevice->subDevice[deviceIndex].deviceHandle,
                                  buffer->userD[deviceIndex].hMemory);
            buffer->userD[deviceIndex].hMemory = 0;
        }

        if (buffer->pushbufferVAHandle[deviceIndex] != 0) {
            nvPushImportRmApiFree(pDevice,
                                  pDevice->subDevice[deviceIndex].deviceHandle,
                                  buffer->pushbufferVAHandle[deviceIndex]);
            buffer->pushbufferVAHandle[deviceIndex] = 0;
        }
    }

    if (buffer->pushbufferHandle != 0) {
        nvPushImportRmApiFree(pDevice,
                              pDevice->subDevice[0].deviceHandle,
                              buffer->pushbufferHandle);
        buffer->pushbufferHandle = 0;
    }

    FreeNotifiers(buffer);

    FreeSemaSurface(buffer);

done:
    NVMISC_MEMSET(buffer, 0, sizeof(*buffer));
}

NvBool nvPushAllocDevice(
    const NvPushAllocDeviceParams *pParams,
    NvPushDevicePtr pDevice)
{
    unsigned int sd;
    NvU64 usedHandleBitmask = 0;

    NVMISC_MEMSET(pDevice, 0, sizeof(*pDevice));

    pDevice->hostDevice = pParams->hostDevice;
    pDevice->pImports = pParams->pImports;
    pDevice->numSubDevices = pParams->numSubDevices;
    pDevice->clientSli = pParams->clientSli;
    pDevice->clientHandle = pParams->clientHandle;

    pDevice->numClasses = pParams->numClasses;
    pDevice->supportedClasses = pParams->supportedClasses;

    pDevice->confidentialComputeMode = pParams->confidentialComputeMode;

    for (sd = 0; sd < pParams->numSubDevices; sd++) {
        pDevice->subDevice[sd].handle = pParams->subDevice[sd].handle;
        pDevice->subDevice[sd].deviceHandle = pParams->subDevice[sd].deviceHandle;
        pDevice->subDevice[sd].gpuVASpaceObject = pParams->subDevice[sd].gpuVASpaceObject;
        pDevice->subDevice[sd].gpuVASpaceCtxDma = pParams->subDevice[sd].gpuVASpace;
    }

    if (pParams->amodel.config != NV_AMODEL_NONE) {
        nvAssert(!"Ignoring AModel configuration on non-XAMODEL build");
    }
    pDevice->amodelConfig = pParams->amodel.config;

    CheckCaps(pDevice);

    if (!GetChannelClassAndUserDSize(pDevice, pParams)) {
        nvPushImportLogError(pDevice,
                             "No supported command buffer format found");
        goto fail;
    }

    if (!__nvPushGetHal(pParams, pDevice->gpfifoClass, &pDevice->hal)) {
        nvPushImportLogError(pDevice, "No push buffer implementation found.");
        goto fail;
    }

    if (!AllocUserMode(pDevice, pParams, &usedHandleBitmask)) {
        nvPushImportLogError(pDevice,
                             "Unable to allocate push buffer controls.");
        goto fail;
    }

    return TRUE;

fail:
    nvPushFreeDevice(pDevice);

    return FALSE;
}

void nvPushFreeDevice(
    NvPushDevicePtr pDevice)
{
    FreeUserMode(pDevice);

    NVMISC_MEMSET(pDevice, 0, sizeof(*pDevice));
}